ext: add McPAT source

this patch adds the source for mcpat, a power, area, and timing modeling framework.
2014-04-01 12:44:30 -04:00 · 2014-04-01 12:44:30 -04:00 · e553a7bfa7
parent 8d665ee166
commit e553a7bfa7
104 changed files with 48876 additions and 0 deletions
--- a/ext/mcpat/ARM_A9.xml
+++ b/ext/mcpat/ARM_A9.xml
@ -0,0 +1,415 @@
 <?xml version="1.0" ?>
 <component id="root" name="root">
 	<component id="system" name="system">
 		<!--McPAT will skip the components if number is set to 0 -->
 		<param name="number_of_cores" value="2"/>
 		<param name="number_of_L1Directories" value="2"/>
 		<param name="number_of_L2Directories" value="0"/>
 		<param name="number_of_L2s" value="0"/> <!-- This number means how many L2 clusters; in each cluster there can be multiple banks/ports -->
 		<param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
 		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
 		<param name="number_of_NoCs" value="1"/>
 		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
 		<param name="homogeneous_L2s" value="1"/>
 		<param name="homogeneous_L1Directorys" value="1"/>
 		<param name="homogeneous_L2Directorys" value="1"/>
 		<param name="homogeneous_L3s" value="1"/>
 		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
 		<param name="homogeneous_NoCs" value="1"/>
 		<param name="core_tech_node" value="40"/><!-- nm -->
 		<param name="target_core_clockrate" value="2000"/><!--MHz -->
 		<param name="temperature" value="380"/> <!-- Kelvin -->
 		<param name="number_cache_levels" value="2"/>
 		<param name="interconnect_projection_type" value="1"/><!--0: aggressive wire technology; 1: conservative wire technology -->
 		<param name="device_type" value="1"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
 		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
 		<param name="Embedded" value="1"/><!-- Embedded processor like ARM or general purpose processors?  -->
 		<param name="machine_bits" value="32"/>
 		<param name="virtual_address_width" value="32"/>
 		<param name="physical_address_width" value="32"/>
 		<param name="virtual_memory_page_size" value="4096"/>
 		<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
 			default value is machine_bits, if not set -->
 		<stat name="total_cycles" value="100000"/>
 		<stat name="idle_cycles" value="0"/>
 		<stat name="busy_cycles"  value="100000"/>
 			<!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of
 			a virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
 		<!-- *********************** cores ******************* -->
 		<component id="system.core0" name="core0">
 			<!-- Core property -->
 			<param name="clock_rate" value="2000"/>
 			<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
 			<param name="opt_local" value="1"/>
 			<param name="instruction_length" value="32"/>
 			<param name="opcode_width" value="7"/>
 			<param name="x86" value="0"/>
 			<param name="micro_opcode_width" value="8"/>
 			<param name="machine_type" value="0"/>
 			<!-- inorder/OoO; 1 inorder; 0 OOO-->
 			<param name="number_hardware_threads" value="1"/>
 			<!-- number_instruction_fetch_ports (icache ports) is always 1 in a single-thread processor,
 			it may only be more than one in SMT processors. BTB ports always equal fetch ports since
 			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once.-->
 			<param name="fetch_width" value="2"/>
 			<!-- fetch_width determines the size of cachelines of L1 cache block -->
 			<param name="number_instruction_fetch_ports" value="1"/>
 			<param name="decode_width" value="2"/>
 			<!-- decode_width determines the number of ports of the
 			renaming table (both RAM and CAM) scheme -->
 			<param name="issue_width" value="4"/>
 			<param name="peak_issue_width" value="7"/>
 			<!-- issue_width determines the number of ports of the issue window and other logic
 			as in the complexity-effective processors paper; issue_width==dispatch_width -->
 			<param name="commit_width" value="4"/>
 			<!-- commit_width determins the number of ports of register files -->
 			<param name="fp_issue_width" value="1"/>
 			<param name="prediction_width" value="1"/> 
 			<!-- number of branch instructions that can be predicted simultaneously -->
 			<!-- Current version of McPAT does not distinguish int and floating point pipelines.
 			These parameters are reserved for future use.-->
 			<param name="pipelines_per_core" value="1,1"/>
 			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
 			<param name="pipeline_depth" value="8,8"/>
 			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
 			<!-- issue and exe unit-->
 			<param name="ALU_per_core" value="3"/>
 			<!-- contains an adder, a shifter, and a logical unit -->
 			<param name="MUL_per_core" value="1"/>
 			<!-- For MUL and Div -->
 			<param name="FPU_per_core" value="1"/>		
 			<!-- buffer between IF and ID stage -->
 			<param name="instruction_buffer_size" value="32"/>
 			<!-- buffer between ID and sche/exe stage -->
 			<param name="decoded_stream_buffer_size" value="16"/>
 			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
 			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
 			<param name="instruction_window_size" value="20"/>
 			<param name="fp_instruction_window_size" value="15"/>
 			<!-- Numbers need to be confirmed -->
 			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
 			<param name="ROB_size" value="0"/>
 			<!-- each in-flight instruction has an entry in ROB -->
 			<!-- registers -->
 			<param name="archi_Regs_IRF_size" value="32"/>			
 			<param name="archi_Regs_FRF_size" value="32"/>
 			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
 			renaming logic is for both integer and floating point insts.  -->
 			<param name="phy_Regs_IRF_size" value="64"/>
 			<param name="phy_Regs_FRF_size" value="64"/>
 			<!-- rename logic -->
 			<param name="rename_scheme" value="0"/>
 			<!-- can be RAM based(0) or CAM based(1) rename scheme 
 			RAM-based scheme will have free list, status table;
 			CAM-based scheme have the valid bit in the data field of the CAM 
 			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
 			Detailed RAT Implementation see TR -->
 			<param name="register_windows_size" value="0"/>
 			<!-- how many windows in the windowed register file, sun processors;
 			no register windowing is used when this number is 0 -->
 			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
 			they will always try to execute out-of-order though. -->
 			<param name="LSU_order" value="inorder"/>
 			<param name="store_buffer_size" value="4"/>
 			<!-- By default, in-order cores do not have load buffers -->
 			<param name="load_buffer_size" value="0"/>	
 			<!-- number of ports refer to sustainable concurrent memory accesses --> 
 			<param name="memory_ports" value="1"/>	
 			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
 			as well as the ports of Dcache which is connected to LSU -->
 			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
 			<param name="RAS_size" value="32"/>						
 			<!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check  -->
 			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
 			<stat name="total_instructions" value="400000"/>
 			<stat name="int_instructions" value="200000"/>
 			<stat name="fp_instructions" value="100000"/>
 			<stat name="branch_instructions" value="100000"/>
 			<stat name="branch_mispredictions" value="0"/>
 			<stat name="load_instructions" value="0"/>
 			<stat name="store_instructions" value="50000"/>
 			<stat name="committed_instructions" value="400000"/>
 			<stat name="committed_int_instructions" value="200000"/>
 			<stat name="committed_fp_instructions" value="100000"/>
 			<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
 			<!-- the following cycle stats are used for heterogeneous cores only,
 				please ignore them for homogeneous cores -->
 			<stat name="total_cycles" value="100000"/>
 		    <stat name="idle_cycles" value="0"/>
 		    <stat name="busy_cycles"  value="100000"/>
 			<!-- instruction buffer stats -->
 			<!-- ROB stats, both RS and Phy based OoOs have ROB;
 			the performance simulator should capture the difference in accesses,
 			otherwise, McPAT has to guess based on the number of committed instructions. -->
 			<stat name="ROB_reads" value="400000"/>
 			<stat name="ROB_writes" value="400000"/>
 			<!-- RAT accesses -->
 			<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
 			<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
 			<stat name="fp_rename_reads" value="200000"/>
 			<stat name="fp_rename_writes" value="100000"/>
 			<!-- decode and rename stage use this, should be total ic - nop -->
 			<!-- Inst window stats -->
 			<stat name="inst_window_reads" value="400000"/>
 			<stat name="inst_window_writes" value="400000"/>
 			<stat name="inst_window_wakeup_accesses" value="800000"/>
 			<stat name="fp_inst_window_reads" value="200000"/>
 			<stat name="fp_inst_window_writes" value="200000"/>
 			<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
 			<!--  RF accesses -->
 			<stat name="int_regfile_reads" value="600000"/>
 			<stat name="float_regfile_reads" value="100000"/>
 			<stat name="int_regfile_writes" value="300000"/>
 			<stat name="float_regfile_writes" value="50000"/>
 			<!-- accesses to the working reg -->
 			<stat name="function_calls" value="5"/>
 			<stat name="context_switches" value="260343"/>
 			<!-- Number of window switches (number of function calls and returns)-->
 			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
 			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
 			<stat name="ialu_accesses" value="300000"/>			
 			<stat name="fpu_accesses" value="100000"/>
 			<stat name="mul_accesses" value="200000"/>
 			<stat name="cdb_alu_accesses" value="300000"/>
 			<stat name="cdb_mul_accesses" value="200000"/>
 			<stat name="cdb_fpu_accesses" value="100000"/>
 			<!-- multiple cycle accesses should be counted multiple times, 
 			otherwise, McPAT can use internal counter for different floating point instructions 
 			to get final accesses. But that needs detailed info for floating point inst mix -->
 			<!--  currently the performance simulator should
 			make sure all the numbers are final numbers,
 			including the explicit read/write accesses,
 			and the implicit accesses such as replacements etc.
 			Future versions of McPAT may be able to infer the implicit accesses
 			based on param and stats of last level cache.
 			The same rule applies to all cache access stats too!  -->
 			<!-- following is AF for max power computation. 
 				Do not change them, unless you understand them-->
 			<stat name="IFU_duty_cycle" value="1"/>			
 			<stat name="LSU_duty_cycle" value="0.5"/>
 			<stat name="MemManU_I_duty_cycle" value="1"/>
 			<stat name="MemManU_D_duty_cycle" value="0.5"/>
 			<stat name="ALU_duty_cycle" value="1"/>
 			<stat name="MUL_duty_cycle" value="0.3"/>
 			<stat name="FPU_duty_cycle" value="0.3"/>
 			<stat name="ALU_cdb_duty_cycle" value="1"/>
 			<stat name="MUL_cdb_duty_cycle" value="0.3"/>
 			<stat name="FPU_cdb_duty_cycle" value="0.3"/>
 			<param name="number_of_BPT" value="2"/>
 			<component id="system.core0.predictor" name="PBT">
 				<!-- branch predictor; tournament predictor see Alpha implementation -->
 				<param name="local_predictor_size" value="10,3"/>
 				<param name="local_predictor_entries" value="1024"/>
 				<param name="global_predictor_entries" value="4096"/>
 				<param name="global_predictor_bits" value="2"/>
 				<param name="chooser_predictor_entries" value="4096"/>
 				<param name="chooser_predictor_bits" value="2"/>
 				<!-- These parameters can be combined like below in next version
 				<param name="load_predictor" value="10,3,1024"/>
 				<param name="global_predictor" value="4096,2"/>
 				<param name="predictor_chooser" value="4096,2"/>
 				-->
 			</component>
 			<component id="system.core0.itlb" name="itlb">
 				<param name="number_entries" value="64"/>
 				<stat name="total_accesses" value="200000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
 				which is actually a replacement -->
 			</component>
 			<component id="system.core0.icache" name="icache">
 				<!-- there are no write requests to the icache although writes happen to it after a miss,
 				which is actually a replacement -->
 				<param name="icache_config" value="32768,8,4,1,10,10,32,0"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
 				<!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
 				<param name="buffer_sizes" value="4, 4, 4,0"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
 				<stat name="read_accesses" value="200000"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="conflicts" value="0"/>				
 			</component>
 			<component id="system.core0.dtlb" name="dtlb">
 				<param name="number_entries" value="64"/><!--dual threads-->
 				<stat name="total_accesses" value="400000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.dcache" name="dcache">
 			        <!-- all the buffer related are optional -->
 				<param name="dcache_config" value="32768,8,4,1, 10,10, 32,1 "/>
 				<param name="buffer_sizes" value="4, 4, 4, 4"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<param name="number_of_BTB" value="2"/>
 			<component id="system.core0.BTB" name="BTB">
 			        <!-- all the buffer related are optional -->
 				<param name="BTB_config" value="2048,4,2, 2, 1,3"/> <!--should be 4096 + 1024 -->
 				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 				<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
 				<stat name="write_accesses" value="0"/>
 			</component>
 	</component>
 		<component id="system.L1Directory0" name="L1Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="2048,1,0,1, 4, 4, 8"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="2000"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="20"/>	
 			    <stat name="duty_cycle" value="0.1"/>
 		</component>
 		<component id="system.L2Directory0" name="L2Directory0">
 				<param name="Directory_type" value="1"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="2000"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="100"/>	
 		</component>
 		<component id="system.L20" name="L20">
 			<!-- all the buffer related are optional -->
 				<param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/> 
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<param name="clockrate" value="2000"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			    <stat name="duty_cycle" value="1.0"/>	
 		</component>
 <!--**********************************************************************-->
 <component id="system.L30" name="L30">
 				<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 				<param name="clockrate" value="800"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="11824"/>
 				<stat name="write_accesses" value="11276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 				<stat name="duty_cycle" value="1.0"/>	
 		</component>
 <!--**********************************************************************-->
 		<component id="system.NoC0" name="noc0">
 			<param name="clockrate" value="2000"/>
 			<param name="type" value="0"/>
 			<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
 				at each time only one node can send req -->
 			<param name="horizontal_nodes" value="1"/>
 			<param name="vertical_nodes" value="1"/>
 			<param name="has_global_link" value="0"/>
 			<!-- 1 has global link, 0 does not have global link -->
 			<param name="link_throughput" value="1"/><!--w.r.t clock -->
 			<param name="link_latency" value="1"/><!--w.r.t clock -->
 			<!-- throughput >= latency -->
 			<!-- Router architecture -->
 			<param name="input_ports" value="1"/>
 			<param name="output_ports" value="1"/>
 			<!-- For bus the I/O ports should be 1 -->
 			<param name="flit_bits" value="128"/>
 			<param name="chip_coverage" value="1"/>
 			<!-- When multiple NOC present, one NOC will cover part of the whole chip. 
 				chip_coverage <=1 -->
 			<param name="link_routing_over_percentage" value="0.5"/>
 			<!-- Links can route over other components or occupy whole area.
 				by default, 50% of the NoC global links routes over other 
 				components -->
 			<stat name="total_accesses" value="100000"/>
 			<!-- This is the number of total accesses within the whole network not for each router -->
 			<stat name="duty_cycle" value="1"/>
 		</component>		
 <!--**********************************************************************-->
 		<component id="system.mem" name="mem">
 			<!-- Main memory property -->
 			<param name="mem_tech_node" value="32"/>
 			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
 			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
 			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
 			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
 			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
 			<!-- above numbers can be easily found from Wikipedia -->
 			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
 			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
 			Current McPAT assumes single DIMMs are used.--> 		
 			<param name="number_ranks" value="2"/>
 			<param name="num_banks_of_DRAM_chip" value="8"/>			
 			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
 			<param name="output_width_of_DRAM_chip" value="8"/>
 			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
 			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
 			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
 			<param name="burstlength_of_DRAM_chip" value="8"/>
 			<stat name="memory_accesses" value="1052"/>
 			<stat name="memory_reads" value="1052"/>
 			<stat name="memory_writes" value="1052"/>									
 		</component>
 		<component id="system.mc" name="mc">
 			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
 			<!-- current version of McPAT uses published values for base parameters of memory controller;
 			improvements on MC will be added in later versions. -->
 			<param name="mc_clock" value="400"/><!--MHz-->
 			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
 			<param name="llc_line_length" value="64"/><!--B-->
 			<param name="number_mcs" value="0"/>
 			<!-- current McPAT only supports homogeneous memory controllers -->
 			<param name="memory_channels_per_mc" value="1"/>
 			<param name="number_ranks" value="2"/>
 			<!-- # of ranks of each channel-->
 			<param name="req_window_size_per_channel" value="32"/>
 			<param name="IO_buffer_size_per_channel" value="32"/>
 			<param name="databus_width" value="128"/>
 			<param name="addressbus_width" value="51"/>
 			<!-- McPAT will add the control bus width to the addressbus width automatically -->
 			<stat name="memory_accesses" value="66666"/>
 			<stat name="memory_reads" value="33333"/>
 			<stat name="memory_writes" value="33333"/>
 			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
 			the average power per MC or per channel. This is sufficient for most applications.
 			Further breakdown can be easily added in later versions. -->
 		</component>
 <!--**********************************************************************-->
 	</component>
 </component>
--- a/ext/mcpat/ARM_A9_2000.xml
+++ b/ext/mcpat/ARM_A9_2000.xml
@ -0,0 +1,463 @@
 <?xml version="1.0" ?>
 <component id="root" name="root">
 	<component id="system" name="system">
 		<!--McPAT will skip the components if number is set to 0 -->
 		<!-- Duty cycles in this file are set according to "ARM MPCore
 			Architecture Performance Enhancement" in MPF Japan 2008 -->
 		<param name="number_of_cores" value="2"/>
 		<param name="number_of_L1Directories" value="2"/>
 		<param name="number_of_L2Directories" value="0"/>
 		<param name="number_of_L2s" value="0"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
 		<param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
 		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
 		<param name="number_of_NoCs" value="1"/>
 		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
 		<param name="homogeneous_L2s" value="1"/>
 		<param name="homogeneous_L1Directorys" value="1"/>
 		<param name="homogeneous_L2Directorys" value="1"/>
 		<param name="homogeneous_L3s" value="1"/>
 		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
 		<param name="homogeneous_NoCs" value="1"/>
 		<param name="core_tech_node" value="22"/><!-- nm -->
 		<param name="target_core_clockrate" value="2000"/><!--MHz -->
 		<param name="temperature" value="340"/> <!-- Kelvin -->
 		<param name="number_cache_levels" value="2"/>
 		<param name="interconnect_projection_type" value="1"/><!--0: aggressive wire technology; 1: conservative wire technology -->
 		<param name="device_type" value="2"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
 		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
 		<param name="Embedded" value="1"/><!-- Embedded processor like ARM or general purpose processors?  -->
 		<param name="opt_clockrate" value="1"/>
 		<param name="machine_bits" value="32"/>
 		<param name="virtual_address_width" value="32"/>
 		<param name="physical_address_width" value="32"/>
 		<param name="virtual_memory_page_size" value="4096"/>
 		<!-- address width determines the tag_width in caches, the LSQ, and buffers in the cache controller;
 			the default value is machine_bits if not set -->
 		<stat name="total_cycles" value="100000"/>
 		<stat name="idle_cycles" value="0"/>
 		<stat name="busy_cycles"  value="100000"/>
 			<!-- The virtual_memory_page_size (B) is completely different from the page size in the main memory section: it is the size of
 			a virtual memory page from the OS/architecture perspective, whereas the page size in the main memory section is the actual physical line in a DRAM bank -->
 		<!-- *********************** cores ******************* -->
 		<component id="system.core0" name="core0">
 			<!-- Core property -->
 			<param name="clock_rate" value="2000"/>
 			<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
 			<param name="opt_local" value="1"/>
 			<param name="instruction_length" value="32"/>
 			<param name="opcode_width" value="7"/>
 			<param name="x86" value="0"/>
 			<param name="micro_opcode_width" value="8"/>
 			<param name="machine_type" value="0"/>
 			<!-- inorder/OoO; 1 inorder; 0 OOO-->
 			<param name="number_hardware_threads" value="1"/>
 			<!-- number_instruction_fetch_ports (icache ports) is always 1 in single-threaded processors;
 			it may be more than one only in SMT processors. BTB ports always equal fetch ports since
 			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once. -->
 			<param name="fetch_width" value="2"/>
 			<!-- fetch_width determines the size of cachelines of L1 cache block -->
 			<param name="number_instruction_fetch_ports" value="1"/>
 			<param name="decode_width" value="2"/>
 			<!-- decode_width determines the number of ports of the
 			renaming table (both RAM and CAM) scheme -->
 			<param name="issue_width" value="4"/>
 			<param name="peak_issue_width" value="7"/>
 			<!-- issue_width determines the number of ports of the issue window and other logic,
 			as in the complexity-effective processors paper; issue_width==dispatch_width -->
 			<param name="commit_width" value="4"/>
 			<!-- commit_width determines the number of ports of register files -->
 			<param name="fp_issue_width" value="1"/>
 			<param name="prediction_width" value="1"/> 
 			<!-- number of branch instructions that can be predicted simultaneously -->
 			<!-- The current version of McPAT does not distinguish int and floating point pipelines.
 			These parameters are reserved for future use. -->
 			<param name="pipelines_per_core" value="1,1"/>
 			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
 			<param name="pipeline_depth" value="8,8"/>
 			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
 			<!-- issue and exe unit-->
 			<param name="ALU_per_core" value="3"/>
 			<!-- contains an adder, a shifter, and a logical unit -->
 			<param name="MUL_per_core" value="1"/>
 			<!-- For MUL and Div -->
 			<param name="FPU_per_core" value="1"/>		
 			<!-- buffer between IF and ID stage -->
 			<param name="instruction_buffer_size" value="32"/>
 			<!-- buffer between ID and sche/exe stage -->
 			<param name="decoded_stream_buffer_size" value="16"/>
 			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
 			<!-- McPAT supports 2 types of OoO cores: RS based and physical reg based -->
 			<param name="instruction_window_size" value="20"/>
 			<param name="fp_instruction_window_size" value="15"/>
 			<!-- Numbers need to be confirmed -->
 			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
 			<param name="ROB_size" value="0"/>
 			<!-- each in-flight instruction has an entry in ROB -->
 			<!-- registers -->
 			<param name="archi_Regs_IRF_size" value="32"/>			
 			<param name="archi_Regs_FRF_size" value="32"/>
 			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
 			renaming logic is for both integer and floating point insts.  -->
 			<param name="phy_Regs_IRF_size" value="64"/>
 			<param name="phy_Regs_FRF_size" value="64"/>
 			<!-- rename logic -->
 			<param name="rename_scheme" value="0"/>
 			<!-- can be RAM based(0) or CAM based(1) rename scheme 
 			RAM-based scheme will have free list, status table;
 			CAM-based scheme have the valid bit in the data field of the CAM 
 			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
 			Detailed RAT Implementation see TR -->
 			<param name="register_windows_size" value="0"/>
 			<!-- how many windows in the windowed register file, sun processors;
 			no register windowing is used when this number is 0 -->
 			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
 			they will always try to execute out of order though. -->
 			<param name="LSU_order" value="inorder"/>
 			<param name="store_buffer_size" value="4"/>
 			<!-- By default, in-order cores do not have load buffers -->
 			<param name="load_buffer_size" value="0"/>	
 			<!-- number of ports refer to sustainable concurrent memory accesses --> 
 			<param name="memory_ports" value="1"/>	
 			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of the load and store buffers
 			as well as the ports of the Dcache which is connected to the LSU -->
 			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
 			<param name="RAS_size" value="4"/>						
 			<!-- general stats; define simulation periods; total, idle, and busy cycles are required for the sanity check -->
 			<!-- please note: if the target architecture is x86, then all the instructions refer to (fused) micro-ops -->
 			<stat name="total_instructions" value="400000"/>
 			<stat name="int_instructions" value="200000"/>
 			<stat name="fp_instructions" value="100000"/>
 			<stat name="branch_instructions" value="100000"/>
 			<stat name="branch_mispredictions" value="0"/>
 			<stat name="load_instructions" value="0"/>
 			<stat name="store_instructions" value="50000"/>
 			<stat name="committed_instructions" value="400000"/>
 			<stat name="committed_int_instructions" value="200000"/>
 			<stat name="committed_fp_instructions" value="100000"/>
 			<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
 			<!-- the following cycle stats are used for heterogeneous cores only;
 				please ignore them for homogeneous cores -->
 			<stat name="total_cycles" value="100000"/>
 		    <stat name="idle_cycles" value="0"/>
 		    <stat name="busy_cycles"  value="100000"/>
 			<!-- instruction buffer stats -->
 			<!-- ROB stats; both RS based and Phy based OoO cores have a ROB.
 			The performance simulator should capture the difference in accesses;
 			otherwise, McPAT has to guess based on the number of committed instructions. -->
 			<stat name="ROB_reads" value="400000"/>
 			<stat name="ROB_writes" value="400000"/>
 			<!-- RAT accesses -->
 			<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
 			<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
 			<stat name="fp_rename_reads" value="200000"/>
 			<stat name="fp_rename_writes" value="100000"/>
 			<!-- decode and rename stage use this, should be total ic - nop -->
 			<!-- Inst window stats -->
 			<stat name="inst_window_reads" value="400000"/>
 			<stat name="inst_window_writes" value="400000"/>
 			<stat name="inst_window_wakeup_accesses" value="800000"/>
 			<stat name="fp_inst_window_reads" value="200000"/>
 			<stat name="fp_inst_window_writes" value="200000"/>
 			<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
 			<!--  RF accesses -->
 			<stat name="int_regfile_reads" value="600000"/>
 			<stat name="float_regfile_reads" value="100000"/>
 			<stat name="int_regfile_writes" value="300000"/>
 			<stat name="float_regfile_writes" value="50000"/>
 			<!-- accesses to the working reg -->
 			<stat name="function_calls" value="5"/>
 			<stat name="context_switches" value="260343"/>
 			<!-- Number of window switches (number of function calls and returns) -->
 			<!-- ALU stats: by default, the processor has one FPU that includes the divider and
 			 multiplier. The FPU accesses should include accesses to the multiplier and divider -->
 			<stat name="ialu_accesses" value="300000"/>			
 			<stat name="fpu_accesses" value="100000"/>
 			<stat name="mul_accesses" value="200000"/>
 			<stat name="cdb_alu_accesses" value="300000"/>
 			<stat name="cdb_mul_accesses" value="200000"/>
 			<stat name="cdb_fpu_accesses" value="100000"/>
 			<!-- multiple cycle accesses should be counted multiple times, 
 			otherwise, McPAT can use internal counter for different floating point instructions 
 			to get final accesses. But that needs detailed info for floating point inst mix -->
 			<!-- Currently the performance simulator should
 			make sure all the numbers are final numbers,
 			including the explicit read/write accesses
 			and the implicit accesses such as replacements, etc.
 			Future versions of McPAT may be able to infer the implicit accesses
 			based on the params and stats of the last level cache.
 			The same rule applies to all cache access stats too! -->
 			<!-- following is AF for max power computation. 
 				Do not change them, unless you understand them-->
 			<stat name="IFU_duty_cycle" value="0.9"/>
 			<stat name="BR_duty_cycle" value="0.72"/><!--branch-->			
 			<stat name="LSU_duty_cycle" value="0.71"/>
 			<stat name="MemManU_I_duty_cycle" value="0.9"/>
 			<stat name="MemManU_D_duty_cycle" value="0.71"/>
 			<stat name="ALU_duty_cycle" value="0.76"/>
 			<!-- (.78*2+.71)/3 -->
 			<stat name="MUL_duty_cycle" value="0.82"/>
 			<stat name="FPU_duty_cycle" value="0.0"/>
 			<stat name="ALU_cdb_duty_cycle" value="0.76"/>
 			<stat name="MUL_cdb_duty_cycle" value="0.82"/>
 			<stat name="FPU_cdb_duty_cycle" value="0.0"/>
 			<param name="number_of_BPT" value="2"/>
 			<component id="system.core0.predictor" name="PBT">
 				<!-- branch predictor; tournament predictor see Alpha implementation -->
 				<param name="local_predictor_size" value="10,3"/>
 				<param name="local_predictor_entries" value="4"/>
 				<param name="global_predictor_entries" value="4096"/>
 				<param name="global_predictor_bits" value="2"/>
 				<param name="chooser_predictor_entries" value="4096"/>
 				<param name="chooser_predictor_bits" value="2"/>
 				<!-- These parameters can be combined like below in next version
 				<param name="load_predictor" value="10,3,1024"/>
 				<param name="global_predictor" value="4096,2"/>
 				<param name="predictor_chooser" value="4096,2"/>
 				-->
 			</component>
 			<component id="system.core0.itlb" name="itlb">
 				<param name="number_entries" value="64"/>
 				<stat name="total_accesses" value="200000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 				<!-- there are no write requests to the itlb, although writes happen to the itlb after a miss,
 				which is actually a replacement -->
 			</component>
 			<component id="system.core0.icache" name="icache">
 				<!-- there are no write requests to the icache, although writes happen to it after a miss,
 				which is actually a replacement -->
 				<param name="icache_config" value="32768,8,4,1,10,10,32,0"/>
 				<!-- the parameters are capacity, block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock, output_width, cache policy -->
 				<!-- cache_policy: 0 no write or write-through with non-write allocate; 1 write-back with write-allocate -->
 				<param name="buffer_sizes" value="4, 4, 4,0"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
 				<stat name="read_accesses" value="200000"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="conflicts" value="0"/>				
 			</component>
 			<component id="system.core0.dtlb" name="dtlb">
 				<param name="number_entries" value="64"/><!--dual threads-->
 				<stat name="total_accesses" value="400000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.dcache" name="dcache">
 			        <!-- all the buffer related are optional -->
 				<param name="dcache_config" value="32768,8,4,1, 10,10, 32,1 "/>
 				<param name="buffer_sizes" value="4, 4, 4, 4"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<param name="number_of_BTB" value="2"/>
 			<component id="system.core0.BTB" name="BTB">
 			        <!-- all the buffer related are optional -->
 				<param name="BTB_config" value="4096,4,2, 2, 1,1"/> 
 				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 				<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
 				<stat name="write_accesses" value="0"/>
 			</component>
 	</component>
 		<component id="system.L1Directory0" name="L1Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="2048,1,0,1, 4, 4, 8"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="2000"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="2"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. invalidates can be considered as writes -->
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="20"/>	
 			    <stat name="duty_cycle" value="0.1"/>
 		</component>
 		<component id="system.L2Directory0" name="L2Directory0">
 				<param name="Directory_type" value="1"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3400"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. invalidates can be considered as writes -->
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="100"/>
 			    <stat name="duty_cycle" value="0.1"/>	
 		</component>
 		<component id="system.L20" name="L20">
 			<!-- all the buffer related are optional -->
 				<param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/> 
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<param name="clockrate" value="3400"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			    <stat name="duty_cycle" value="1.0"/>	
 		</component>
 <!--**********************************************************************-->
 <component id="system.L30" name="L30">
 				<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 				<param name="clockrate" value="800"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="11824"/>
 				<stat name="write_accesses" value="11276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 				<stat name="duty_cycle" value="1.0"/>	
 		</component>
 <!--**********************************************************************-->
 		<component id="system.NoC0" name="noc0">
 			<param name="clockrate" value="2000"/>
 			<param name="type" value="0"/>
 			<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
 				at each time only one node can send req -->
 			<param name="horizontal_nodes" value="1"/>
 			<param name="vertical_nodes" value="1"/>
 			<param name="has_global_link" value="0"/>
 			<!-- 1 has global link, 0 does not have global link -->
 			<param name="link_throughput" value="1"/><!--w.r.t clock -->
 			<param name="link_latency" value="1"/><!--w.r.t clock -->
 			<!-- throughput >= latency -->
 			<!-- Router architecture -->
 			<param name="input_ports" value="1"/>
 			<param name="output_ports" value="1"/>
 			<!-- For bus the I/O ports should be 1 -->
 			<param name="flit_bits" value="64"/>
 			<param name="chip_coverage" value="1"/>
 			<!-- When multiple NOC present, one NOC will cover part of the whole chip. 
 				chip_coverage <=1 -->
 			<param name="link_routing_over_percentage" value="0.5"/>
 			<!-- Links can route over other components or occupy whole area.
 				by default, 50% of the NoC global links routes over other 
 				components -->
 			<stat name="total_accesses" value="100000"/>
 			<!-- This is the number of total accesses within the whole network not for each router -->
 			<stat name="duty_cycle" value="0.2"/>
 		</component>		
 <!--**********************************************************************-->
 		<component id="system.mem" name="mem">
 			<!-- Main memory property -->
 			<param name="mem_tech_node" value="32"/>
 			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
 			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
 			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
 			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
 			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
 			<!-- above numbers can be easily found from Wikipedia -->
 			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
 			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
 			Current McPAT assumes single DIMMs are used.--> 		
 			<param name="number_ranks" value="2"/>
 			<param name="num_banks_of_DRAM_chip" value="8"/>			
 			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
 			<param name="output_width_of_DRAM_chip" value="8"/>
 			<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
 			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
 			<param name="burstlength_of_DRAM_chip" value="8"/>
 			<stat name="memory_accesses" value="1052"/>
 			<stat name="memory_reads" value="1052"/>
 			<stat name="memory_writes" value="1052"/>									
 		</component>
 		<component id="system.mc" name="mc">
 			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
 			<!-- The current version of McPAT uses published values for the base parameters of the memory controller;
 			improvements to the MC model will be added in later versions. -->
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
 			<param name="mc_clock" value="400"/><!--MHz-->
 			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
 			<param name="block_size" value="64"/><!--(B) the block size of last level cache, which is the unit for one memory burst transfer -->
 			<param name="number_mcs" value="1"/>
 			<!-- current McPAT only supports homogeneous memory controllers -->
 			<param name="memory_channels_per_mc" value="1"/>
 			<param name="number_ranks" value="0"/>
 			<!-- # of ranks of each channel-->
 			<param name="req_window_size_per_channel" value="32"/>
 			<param name="IO_buffer_size_per_channel" value="32"/>
 			<param name="databus_width" value="128"/>
 			<param name="addressbus_width" value="51"/>
 			<!-- McPAT will add the control bus width to the addressbus width automatically -->
 			<stat name="memory_accesses" value="66666"/>
 			<stat name="memory_reads" value="33333"/>
 			<stat name="memory_writes" value="33333"/>
 			<param name="withPHY" value="1"/>
 			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
 			the average power per MC or per channel. This is sufficient for most applications.
 			Finer-grained tracking can easily be added in later versions. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.niu" name="niu">
 			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
 			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
 				 The lower bound on the clock rate of a 10Gb MAC is 150MHz -->
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="1"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
 			the average power per NIC or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.pcie" name="pcie">
 			<!-- On chip PCIe controller, including Phy-->
 			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
 				 The lower bound on the clock rate of the per-lane PCIe logic is 120MHz -->
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
 			<param name="withPHY" value="1"/>
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="1"/>
 			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
 			the average power per PCIe controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.flashc" name="flashc">
 		    <param name="number_flashcs" value="1"/>
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
            <param name="withPHY" value="1"/>
 			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
 			the average power per flash controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		</component>
 </component>
--- a/ext/mcpat/ARM_A9_800.xml
+++ b/ext/mcpat/ARM_A9_800.xml
@ -0,0 +1,463 @@
 <?xml version="1.0" ?>
 <component id="root" name="root">
 	<component id="system" name="system">
 		<!--McPAT will skip the components if number is set to 0 -->
 		<!--Duty cycles in this file are set according to "ARM MPcore
 			ARchitecture performance Enhancement" in MPF Japan 2008 -->
 		<param name="number_of_cores" value="2"/>
 		<param name="number_of_L1Directories" value="2"/>
 		<param name="number_of_L2Directories" value="0"/>
 		<param name="number_of_L2s" value="0"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
 		<param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
 		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
 		<param name="number_of_NoCs" value="1"/>
 		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
 		<param name="homogeneous_L2s" value="1"/>
 		<param name="homogeneous_L1Directorys" value="1"/>
 		<param name="homogeneous_L2Directorys" value="1"/>
 		<param name="homogeneous_L3s" value="1"/>
 		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
 		<param name="homogeneous_NoCs" value="1"/>
 		<param name="core_tech_node" value="32"/><!-- nm -->
 		<param name="target_core_clockrate" value="800"/><!--MHz -->
 		<param name="temperature" value="340"/> <!-- Kelvin -->
 		<param name="number_cache_levels" value="2"/>
 		<param name="interconnect_projection_type" value="1"/><!--0: aggressive wire technology; 1: conservative wire technology -->
 		<param name="device_type" value="2"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
 		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
 		<param name="Embedded" value="1"/><!-- Embedded processor like ARM or general purpose processors?  -->
 		<param name="opt_clockrate" value="0"/>
 		<param name="machine_bits" value="32"/>
 		<param name="virtual_address_width" value="32"/>
 		<param name="physical_address_width" value="32"/>
 		<param name="virtual_memory_page_size" value="4096"/>
 		<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
 			default value is machine_bits, if not set -->
 		<stat name="total_cycles" value="100000"/>
 		<stat name="idle_cycles" value="0"/>
 		<stat name="busy_cycles"  value="100000"/>
 			<!-- The virtual_memory_page_size (B) above is completely different from the page size in the main memory section. It is the size of
 			a virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
 		<!-- *********************** cores ******************* -->
 		<component id="system.core0" name="core0">
 			<!-- Core property -->
 			<param name="clock_rate" value="800"/>
 			<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
 			<param name="opt_local" value="1"/>
 			<param name="instruction_length" value="32"/>
 			<param name="opcode_width" value="7"/>
 			<param name="x86" value="0"/>
 			<param name="micro_opcode_width" value="8"/>
 			<param name="machine_type" value="0"/>
 			<!-- inorder/OoO; 1 inorder; 0 OOO-->
 			<param name="number_hardware_threads" value="1"/>
 			<!-- number_instruction_fetch_ports (icache ports) is always 1 in single-threaded processors;
 			it may be more than one only in SMT processors. BTB ports always equal fetch ports, since
 			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once. -->
 			<param name="fetch_width" value="2"/>
 			<!-- fetch_width determines the size of cachelines of L1 cache block -->
 			<param name="number_instruction_fetch_ports" value="1"/>
 			<param name="decode_width" value="2"/>
 			<!-- decode_width determines the number of ports of the
 			renaming table (both RAM and CAM) scheme -->
 			<param name="issue_width" value="4"/>
 			<param name="peak_issue_width" value="7"/>
 			<!-- issue_width determines the number of ports of the issue window and other logic,
 			as in the complexity-effective processors paper; issue_width==dispatch_width -->
 			<param name="commit_width" value="4"/>
 			<!-- commit_width determins the number of ports of register files -->
 			<param name="fp_issue_width" value="1"/>
 			<param name="prediction_width" value="1"/> 
 			<!-- number of branch instructions that can be predicted simultaneously -->
 			<!-- The current version of McPAT does not distinguish int and floating point pipelines.
 			These parameters are reserved for future use. -->
 			<param name="pipelines_per_core" value="1,1"/>
 			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
 			<param name="pipeline_depth" value="8,8"/>
 			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
 			<!-- issue and exe unit-->
 			<param name="ALU_per_core" value="3"/>
 			<!-- contains an adder, a shifter, and a logical unit -->
 			<param name="MUL_per_core" value="1"/>
 			<!-- For MUL and Div -->
 			<param name="FPU_per_core" value="1"/>		
 			<!-- buffer between IF and ID stage -->
 			<param name="instruction_buffer_size" value="32"/>
 			<!-- buffer between ID and sche/exe stage -->
 			<param name="decoded_stream_buffer_size" value="16"/>
 			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
 			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
 			<param name="instruction_window_size" value="20"/>
 			<param name="fp_instruction_window_size" value="15"/>
 			<!-- Numbers need to be confirmed -->
 			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
 			<param name="ROB_size" value="0"/>
 			<!-- each in-flight instruction has an entry in ROB -->
 			<!-- registers -->
 			<param name="archi_Regs_IRF_size" value="32"/>			
 			<param name="archi_Regs_FRF_size" value="32"/>
 			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
 			renaming logic is for both integer and floating point insts.  -->
 			<param name="phy_Regs_IRF_size" value="64"/>
 			<param name="phy_Regs_FRF_size" value="64"/>
 			<!-- rename logic -->
 			<param name="rename_scheme" value="0"/>
 			<!-- can be RAM based(0) or CAM based(1) rename scheme 
 			RAM-based scheme will have free list, status table;
 			CAM-based scheme have the valid bit in the data field of the CAM 
 			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
 			Detailed RAT Implementation see TR -->
 			<param name="register_windows_size" value="0"/>
 			<!-- how many windows in the windowed register file, sun processors;
 			no register windowing is used when this number is 0 -->
 			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (OoO, Alpha).
 			They will always try to execute out of order though. -->
 			<param name="LSU_order" value="inorder"/>
 			<param name="store_buffer_size" value="4"/>
 			<!-- By default, in-order cores do not have load buffers -->
 			<param name="load_buffer_size" value="0"/>	
 			<!-- number of ports refer to sustainable concurrent memory accesses --> 
 			<param name="memory_ports" value="1"/>	
 			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
 			as well as the ports of Dcache which is connected to LSU -->
 			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
 			<param name="RAS_size" value="4"/>						
 			<!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check -->
 			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
 			<stat name="total_instructions" value="400000"/>
 			<stat name="int_instructions" value="200000"/>
 			<stat name="fp_instructions" value="100000"/>
 			<stat name="branch_instructions" value="100000"/>
 			<stat name="branch_mispredictions" value="0"/>
 			<stat name="load_instructions" value="0"/>
 			<stat name="store_instructions" value="50000"/>
 			<stat name="committed_instructions" value="400000"/>
 			<stat name="committed_int_instructions" value="200000"/>
 			<stat name="committed_fp_instructions" value="100000"/>
 			<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
 			<!-- the following cycle stats are used for heterogeneous cores only;
 				please ignore them for homogeneous cores -->
 			<stat name="total_cycles" value="100000"/>
 		    <stat name="idle_cycles" value="0"/>
 		    <stat name="busy_cycles"  value="100000"/>
 			<!-- instruction buffer stats -->
 			<!-- ROB stats: both RS-based and Phy-based OoOs have a ROB.
 			The performance simulator should capture the difference in accesses;
 			otherwise, McPAT has to guess based on the number of committed instructions. -->
 			<stat name="ROB_reads" value="400000"/>
 			<stat name="ROB_writes" value="400000"/>
 			<!-- RAT accesses -->
 			<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
 			<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
 			<stat name="fp_rename_reads" value="200000"/>
 			<stat name="fp_rename_writes" value="100000"/>
 			<!-- decode and rename stage use this, should be total ic - nop -->
 			<!-- Inst window stats -->
 			<stat name="inst_window_reads" value="400000"/>
 			<stat name="inst_window_writes" value="400000"/>
 			<stat name="inst_window_wakeup_accesses" value="800000"/>
 			<stat name="fp_inst_window_reads" value="200000"/>
 			<stat name="fp_inst_window_writes" value="200000"/>
 			<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
 			<!--  RF accesses -->
 			<stat name="int_regfile_reads" value="600000"/>
 			<stat name="float_regfile_reads" value="100000"/>
 			<stat name="int_regfile_writes" value="300000"/>
 			<stat name="float_regfile_writes" value="50000"/>
 			<!-- accesses to the working reg -->
 			<stat name="function_calls" value="5"/>
 			<stat name="context_switches" value="260343"/>
 			<!-- Number of window switches (number of function calls and returns)-->
 			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
 			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
 			<stat name="ialu_accesses" value="300000"/>			
 			<stat name="fpu_accesses" value="100000"/>
 			<stat name="mul_accesses" value="200000"/>
 			<stat name="cdb_alu_accesses" value="300000"/>
 			<stat name="cdb_mul_accesses" value="200000"/>
 			<stat name="cdb_fpu_accesses" value="100000"/>
 			<!-- multiple cycle accesses should be counted multiple times, 
 			otherwise, McPAT can use internal counter for different floating point instructions 
 			to get final accesses. But that needs detailed info for floating point inst mix -->
 			<!-- Currently the performance simulator should
 			make sure all the numbers are final numbers,
 			including the explicit read/write accesses
 			and the implicit accesses such as replacements, etc.
 			Future versions of McPAT may be able to infer the implicit accesses
 			based on the params and stats of the last level cache.
 			The same rule applies to all cache access stats too! -->
 			<!-- following is AF for max power computation. 
 				Do not change them, unless you understand them-->
 			<stat name="IFU_duty_cycle" value="0.9"/>
 			<stat name="BR_duty_cycle" value="0.72"/><!--branch-->			
 			<stat name="LSU_duty_cycle" value="0.71"/>
 			<stat name="MemManU_I_duty_cycle" value="0.9"/>
 			<stat name="MemManU_D_duty_cycle" value="0.71"/>
 			<stat name="ALU_duty_cycle" value="0.76"/>
 			<!-- (.78*2+.71)/3 -->
 			<stat name="MUL_duty_cycle" value="0.82"/>
 			<stat name="FPU_duty_cycle" value="0.0"/>
 			<stat name="ALU_cdb_duty_cycle" value="0.76"/>
 			<stat name="MUL_cdb_duty_cycle" value="0.82"/>
 			<stat name="FPU_cdb_duty_cycle" value="0.0"/>
 			<param name="number_of_BPT" value="2"/>
 			<component id="system.core0.predictor" name="PBT">
 				<!-- branch predictor; tournament predictor see Alpha implementation -->
 				<param name="local_predictor_size" value="10,3"/>
 				<param name="local_predictor_entries" value="4"/>
 				<param name="global_predictor_entries" value="4096"/>
 				<param name="global_predictor_bits" value="2"/>
 				<param name="chooser_predictor_entries" value="4096"/>
 				<param name="chooser_predictor_bits" value="2"/>
 				<!-- These parameters can be combined like below in next version
 				<param name="load_predictor" value="10,3,1024"/>
 				<param name="global_predictor" value="4096,2"/>
 				<param name="predictor_chooser" value="4096,2"/>
 				-->
 			</component>
 			<component id="system.core0.itlb" name="itlb">
 				<param name="number_entries" value="64"/>
 				<stat name="total_accesses" value="200000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
 				which is actually a replacement -->
 			</component>
 			<component id="system.core0.icache" name="icache">
 				<!-- there are no write requests to the icache, although writes happen to it after a miss;
 				those are actually replacements -->
 				<param name="icache_config" value="32768,8,4,1,10,10,32,0"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
 				<!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
 				<param name="buffer_sizes" value="4, 4, 4,0"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
 				<stat name="read_accesses" value="200000"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="conflicts" value="0"/>				
 			</component>
 			<component id="system.core0.dtlb" name="dtlb">
 				<param name="number_entries" value="64"/><!--dual threads-->
 				<stat name="total_accesses" value="400000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.dcache" name="dcache">
 			        <!-- all the buffer related are optional -->
 				<param name="dcache_config" value="32768,8,4,1, 10,10, 32,1 "/>
 				<param name="buffer_sizes" value="4, 4, 4, 4"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<param name="number_of_BTB" value="2"/>
 			<component id="system.core0.BTB" name="BTB">
 			        <!-- all the buffer related are optional -->
 				<param name="BTB_config" value="4096,4,2, 2, 1,1"/> 
 				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 				<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
 				<stat name="write_accesses" value="0"/>
 			</component>
 	</component>
 		<component id="system.L1Directory0" name="L1Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="2048,1,0,1, 4, 4, 8"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="800"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="2"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="20"/>	
 			    <stat name="duty_cycle" value="0.1"/>
 		</component>
 		<component id="system.L2Directory0" name="L2Directory0">
 				<param name="Directory_type" value="1"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3400"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="100"/>
 			    <stat name="duty_cycle" value="0.1"/>	
 		</component>
 		<component id="system.L20" name="L20">
 			<!-- all the buffer related are optional -->
 				<param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/> 
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<param name="clockrate" value="3400"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			    <stat name="duty_cycle" value="1.0"/>	
 		</component>
 <!--**********************************************************************-->
 <component id="system.L30" name="L30">
 				<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 				<param name="clockrate" value="800"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="11824"/>
 				<stat name="write_accesses" value="11276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 				<stat name="duty_cycle" value="1.0"/>	
 		</component>
 <!--**********************************************************************-->
 		<component id="system.NoC0" name="noc0">
 			<param name="clockrate" value="800"/>
 			<param name="type" value="0"/>
 			<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
 				at each time only one node can send req -->
 			<param name="horizontal_nodes" value="1"/>
 			<param name="vertical_nodes" value="1"/>
 			<param name="has_global_link" value="0"/>
 			<!-- 1 has global link, 0 does not have global link -->
 			<param name="link_throughput" value="1"/><!--w.r.t clock -->
 			<param name="link_latency" value="1"/><!--w.r.t clock -->
 			<!-- throughput >= latency -->
 			<!-- Router architecture -->
 			<param name="input_ports" value="1"/>
 			<param name="output_ports" value="1"/>
 			<!-- For bus the I/O ports should be 1 -->
 			<param name="flit_bits" value="64"/>
 			<param name="chip_coverage" value="1"/>
 			<!-- When multiple NOC present, one NOC will cover part of the whole chip. 
 				chip_coverage <=1 -->
 			<param name="link_routing_over_percentage" value="0.5"/>
 			<!-- Links can route over other components or occupy whole area.
 				by default, 50% of the NoC global links routes over other 
 				components -->
 			<stat name="total_accesses" value="100000"/>
 			<!-- This is the number of total accesses within the whole network not for each router -->
 			<stat name="duty_cycle" value="0.2"/>
 		</component>		
 <!--**********************************************************************-->
 		<component id="system.mem" name="mem">
 			<!-- Main memory property -->
 			<param name="mem_tech_node" value="32"/>
 			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
 			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
 			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
 			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
 			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
 			<!-- above numbers can be easily found from Wikipedia -->
 			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
 			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
 			Current McPAT assumes single DIMMs are used.--> 		
 			<param name="number_ranks" value="2"/>
 			<param name="num_banks_of_DRAM_chip" value="8"/>			
 			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
 			<param name="output_width_of_DRAM_chip" value="8"/>
 			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
 			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
 			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
 			<param name="burstlength_of_DRAM_chip" value="8"/>
 			<stat name="memory_accesses" value="1052"/>
 			<stat name="memory_reads" value="1052"/>
 			<stat name="memory_writes" value="1052"/>									
 		</component>
 		<component id="system.mc" name="mc">
 			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
 			<!-- The current version of McPAT uses published values for the base parameters of the memory controller;
 			improvements on MC will be added in later versions. -->
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
 			<param name="mc_clock" value="400"/><!--MHz-->
 			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
 			<param name="block_size" value="64"/><!--(B) the block size of last level cache, which is the unit for one memory burst transfer -->
 			<param name="number_mcs" value="0"/>
 			<!-- current McPAT only supports homogeneous memory controllers -->
 			<param name="memory_channels_per_mc" value="1"/>
 			<param name="number_ranks" value="0"/>
 			<!-- # of ranks of each channel-->
 			<param name="req_window_size_per_channel" value="32"/>
 			<param name="IO_buffer_size_per_channel" value="32"/>
 			<param name="databus_width" value="128"/>
 			<param name="addressbus_width" value="51"/>
 			<!-- McPAT will add the control bus width to the addressbus width automatically -->
 			<stat name="memory_accesses" value="66666"/>
 			<stat name="memory_reads" value="33333"/>
 			<stat name="memory_writes" value="33333"/>
 			<param name="withPHY" value="1"/>
 			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
 			the average power per MC or per channel. This is sufficient for most applications.
 			Finer-grained tracking can be easily added in later versions. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.niu" name="niu">
 			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
 			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
 				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
 			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
 			the average power per NIC or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.pcie" name="pcie">
 			<!-- On chip PCIe controller, including Phy-->
 			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
 				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
 			<param name="withPHY" value="1"/>
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/>
 			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
 			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
 			the average power per PCIe controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.flashc" name="flashc">
 		    <param name="number_flashcs" value="0"/>
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
            <param name="withPHY" value="1"/>
 			<param name="peak_transfer_rate" value="200"/><!-- Per-controller sustainable peak rate, MB/s -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
 			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
 			the average power per flash controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		</component>
 </component>
--- a/ext/mcpat/Alpha21364.xml
+++ b/ext/mcpat/Alpha21364.xml
@ -0,0 +1,456 @@
 <?xml version="1.0" ?>
 <component id="root" name="root">
 	<component id="system" name="system">
 		<!--McPAT will skip the components if number is set to 0 -->
 		<param name="number_of_cores" value="1"/>
 		<param name="number_of_L1Directories" value="0"/>
 		<param name="number_of_L2Directories" value="1"/>
 		<param name="number_of_L2s" value="1"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
 		<param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
 		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
 		<param name="number_of_NoCs" value="1"/>
 		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
 		<param name="homogeneous_L2s" value="1"/>
 		<param name="homogeneous_L1Directorys" value="1"/>
 		<param name="homogeneous_L2Directorys" value="1"/>
 		<param name="homogeneous_L3s" value="1"/>
 		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
 		<param name="homogeneous_NoCs" value="1"/>
 		<param name="core_tech_node" value="90"/><!-- nm -->
 		<param name="target_core_clockrate" value="1200"/><!--MHz -->
 		<param name="temperature" value="380"/> <!-- Kelvin -->
 		<param name="number_cache_levels" value="2"/>
 		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
 		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
 		<param name="longer_channel_device" value="0"/><!-- 0 no use; 1 use when appropriate -->
 		<param name="machine_bits" value="64"/>
 		<param name="virtual_address_width" value="64"/>
 		<param name="physical_address_width" value="52"/>
 		<param name="virtual_memory_page_size" value="4096"/>
 		<!-- address width determines the tag_width in Cache, LSQ and buffers in the cache controller;
 			the default value is machine_bits, if not set -->
 		<stat name="total_cycles" value="100000"/>
 		<stat name="idle_cycles" value="0"/>
 		<stat name="busy_cycles"  value="100000"/>
 			<!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of
 			virtual memory pages from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
 		<!-- *********************** cores ******************* -->
 		<component id="system.core0" name="core0">
 			<!-- Core property -->
 			<param name="clock_rate" value="1200"/>
 			<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
 			<param name="opt_local" value="1"/>
 			<param name="instruction_length" value="32"/>
 			<param name="opcode_width" value="7"/>
 			<param name="x86" value="0"/>
 			<param name="micro_opcode_width" value="8"/>
 			<param name="machine_type" value="0"/>
 			<!-- inorder/OoO; 1 inorder; 0 OOO-->
 			<param name="number_hardware_threads" value="1"/>
 			<!-- number_instruction_fetch_ports (icache ports) is always 1 in single-thread processors;
 			it may only be more than one in SMT processors. BTB ports always equal fetch ports, since
 			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once.-->
 			<param name="fetch_width" value="4"/>
 			<!-- fetch_width determines the size of cachelines of L1 cache block -->
 			<param name="number_instruction_fetch_ports" value="1"/>
 			<param name="decode_width" value="4"/>
 			<!-- decode_width determines the number of ports of the
 			renaming table (both RAM and CAM) scheme -->
 			<param name="issue_width" value="4"/>
 			<param name="peak_issue_width" value="6"/>
 			<!-- issue_width determines the number of ports of the issue window and other logic,
 			as in the complexity-effective processors paper; issue_width==dispatch_width -->
 			<param name="commit_width" value="4"/>
 			<!-- commit_width determins the number of ports of register files -->
 			<param name="fp_issue_width" value="2"/>
 			<param name="prediction_width" value="1"/> 
 			<!-- number of branch instructions that can be predicted simultaneously-->
 			<!-- The current version of McPAT does not distinguish int and floating point pipelines.
 			These parameters are reserved for future use.-->
 			<param name="pipelines_per_core" value="1,1"/>
 			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
 			<param name="pipeline_depth" value="7,7"/>
 			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
 			<!-- issue and exe unit-->
 			<param name="ALU_per_core" value="4"/>
 			<!-- contains an adder, a shifter, and a logical unit -->
 			<param name="MUL_per_core" value="0"/>
 			<!-- For MUL and Div -->
 			<param name="FPU_per_core" value="1"/>		
 			<!-- buffer between IF and ID stage -->
 			<param name="instruction_buffer_size" value="32"/>
 			<!-- buffer between ID and sche/exe stage -->
 			<param name="decoded_stream_buffer_size" value="16"/>
 			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
 			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
 			<param name="instruction_window_size" value="20"/>
 			<param name="fp_instruction_window_size" value="15"/>
 			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
 			<param name="ROB_size" value="80"/>
 			<!-- each in-flight instruction has an entry in ROB -->
 			<!-- registers -->
 			<param name="archi_Regs_IRF_size" value="32"/>		
 			<param name="archi_Regs_FRF_size" value="32"/>
 			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
 			renaming logic is for both integer and floating point insts.  -->
 			<param name="phy_Regs_IRF_size" value="80"/>
 			<param name="phy_Regs_FRF_size" value="72"/>
 			<!-- rename logic -->
 			<param name="rename_scheme" value="1"/>
 			<!-- can be RAM based(0) or CAM based(1) rename scheme 
 			RAM-based scheme will have free list, status table;
 			CAM-based scheme have the valid bit in the data field of the CAM 
 			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
 			Detailed RAT Implementation see TR -->
 			<param name="register_windows_size" value="0"/>
 			<!-- how many windows in the windowed register file, sun processors;
 			no register windowing is used when this number is 0 -->
 			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
 			they will always try to execute out-of-order though. -->
 			<param name="LSU_order" value="inorder"/>
 			<param name="store_buffer_size" value="32"/>
 			<!-- By default, in-order cores do not have load buffers -->
 			<param name="load_buffer_size" value="32"/>	
 			<!-- number of ports refer to sustainable concurrent memory accesses --> 
 			<param name="memory_ports" value="2"/>	
 			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of the load and store buffers
 			as well as the ports of the Dcache which is connected to the LSU -->
 			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
 			<param name="RAS_size" value="32"/>						
 			<!-- general stats, define simulation periods; require total, idle, and busy cycles for sanity check  -->
 			<!-- please note: if the target architecture is X86, then all the instructions refer to (fused) micro-ops -->
 			<stat name="total_instructions" value="400000"/>
 			<stat name="int_instructions" value="200000"/>
 			<stat name="fp_instructions" value="100000"/>
 			<stat name="branch_instructions" value="100000"/>
 			<stat name="branch_mispredictions" value="0"/>
 			<stat name="load_instructions" value="0"/>
 			<stat name="store_instructions" value="50000"/>
 			<stat name="committed_instructions" value="400000"/>
 			<stat name="committed_int_instructions" value="200000"/>
 			<stat name="committed_fp_instructions" value="100000"/>
 			<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
 			<!-- the following cycle stats are used for heterogeneous cores only;
 				please ignore them for homogeneous cores -->
 			<stat name="total_cycles" value="100000"/>
 		    <stat name="idle_cycles" value="0"/>
 		    <stat name="busy_cycles"  value="100000"/>
 			<!-- instruction buffer stats -->
 			<!-- ROB stats, both RS and Phy based OoOs have a ROB;
 			the performance simulator should capture the difference in accesses,
 			otherwise McPAT has to guess based on the number of committed instructions. -->
 			<stat name="ROB_reads" value="400000"/>
 			<stat name="ROB_writes" value="400000"/>
 			<!-- RAT accesses -->
 			<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
 			<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
 			<stat name="fp_rename_reads" value="200000"/>
 			<stat name="fp_rename_writes" value="100000"/>
 			<!-- decode and rename stage use this, should be total ic - nop -->
 			<!-- Inst window stats -->
 			<stat name="inst_window_reads" value="400000"/>
 			<stat name="inst_window_writes" value="400000"/>
 			<stat name="inst_window_wakeup_accesses" value="800000"/>
 			<stat name="fp_inst_window_reads" value="200000"/>
 			<stat name="fp_inst_window_writes" value="200000"/>
 			<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
 			<!--  RF accesses -->
 			<stat name="int_regfile_reads" value="600000"/>
 			<stat name="float_regfile_reads" value="100000"/>
 			<stat name="int_regfile_writes" value="300000"/>
 			<stat name="float_regfile_writes" value="50000"/>
 			<!-- accesses to the working reg -->
 			<stat name="function_calls" value="5"/>
 			<stat name="context_switches" value="260343"/>
 			<!-- Number of window switches (number of function calls and returns)-->
 			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
 			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
 			<stat name="ialu_accesses" value="300000"/>			
 			<stat name="fpu_accesses" value="100000"/>
 			<stat name="mul_accesses" value="200000"/>
 			<stat name="cdb_alu_accesses" value="300000"/>
 			<stat name="cdb_mul_accesses" value="200000"/>
 			<stat name="cdb_fpu_accesses" value="100000"/>
 			<!-- multiple cycle accesses should be counted multiple times, 
 			otherwise, McPAT can use internal counter for different floating point instructions 
 			to get final accesses. But that needs detailed info for floating point inst mix -->
 			<!--  currently the performance simulator should
 			make sure all the numbers are final numbers,
 			including the explicit read/write accesses
 			and the implicit accesses such as replacements, etc.
 			Future versions of McPAT may be able to infer the implicit accesses
 			based on the params and stats of the last level cache.
 			The same rule applies to all cache access stats too!  -->
 			<!-- following is AF for max power computation. 
 				Do not change them, unless you understand them-->
 			<stat name="IFU_duty_cycle" value="1"/>			
 			<stat name="LSU_duty_cycle" value="1"/>
 			<stat name="MemManU_I_duty_cycle" value="1"/>
 			<stat name="MemManU_D_duty_cycle" value="1"/>
 			<stat name="ALU_duty_cycle" value="1"/>
 			<stat name="MUL_duty_cycle" value="0.3"/>
 			<stat name="FPU_duty_cycle" value="1"/>
 			<stat name="ALU_cdb_duty_cycle" value="1"/>
 			<stat name="MUL_cdb_duty_cycle" value="0.3"/>
 			<stat name="FPU_cdb_duty_cycle" value="1"/>
 			<param name="number_of_BPT" value="2"/>
 			<component id="system.core0.predictor" name="PBT">
 				<!-- branch predictor; tournament predictor see Alpha implementation -->
 				<param name="local_predictor_size" value="10,3"/>
 				<param name="local_predictor_entries" value="1024"/>
 				<param name="global_predictor_entries" value="4096"/>
 				<param name="global_predictor_bits" value="2"/>
 				<param name="chooser_predictor_entries" value="4096"/>
 				<param name="chooser_predictor_bits" value="2"/>
 				<!-- These parameters can be combined like below in next version
 				<param name="load_predictor" value="10,3,1024"/>
 				<param name="global_predictor" value="4096,2"/>
 				<param name="predictor_chooser" value="4096,2"/>
 				-->
 			</component>
 			<component id="system.core0.itlb" name="itlb">
 				<param name="number_entries" value="128"/>
 				<stat name="total_accesses" value="200000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
 				which is actually a replacement -->
 			</component>
 			<component id="system.core0.icache" name="icache">
 				<!-- there are no write requests to the icache, although writes happen to it after a miss;
 				such a write is actually a replacement -->
 				<param name="icache_config" value="65536,16,2,1,1,2,16,0"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
 				<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
 				<param name="buffer_sizes" value="16, 16, 16,0"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
 				<stat name="read_accesses" value="200000"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="conflicts" value="0"/>				
 			</component>
 			<component id="system.core0.dtlb" name="dtlb">
 				<param name="number_entries" value="128"/><!--dual threads-->
 				<stat name="total_accesses" value="400000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.dcache" name="dcache">
 			        <!-- all the buffer related are optional -->
 				<param name="dcache_config" value="65536,16,2,1,1,3,16,0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<param name="number_of_BTB" value="2"/>
 			<component id="system.core0.BTB" name="BTB">
 			        <!-- all the buffer related are optional -->
 				<param name="BTB_config" value="6144,4,2,1, 1,3"/> <!--48Kbits -->
 				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 				<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
 				<stat name="write_accesses" value="0"/>
 			</component>
 	</component>
 		<component id="system.L1Directory0" name="L1Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="4096,2,0,1,100,100, 8"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3400"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="20"/>	
 		</component>
 		<component id="system.L2Directory0" name="L2Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="512,4,0,1,1, 1"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="16, 16, 16, 16"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="1200"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="100"/>	
 		</component>
 		<component id="system.L20" name="L20">
 			<!-- all the buffer related are optional -->
 				<param name="L2_config" value="1835008,16, 8, 16, 32, 32, 12, 1"/> 
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<param name="clockrate" value="1200"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			    <stat name="duty_cycle" value="1.0"/>	
 		</component>
 <!--**********************************************************************-->
 <component id="system.L30" name="L30">
 				<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 				<param name="clockrate" value="850"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="11824"/>
 				<stat name="write_accesses" value="11276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 				<stat name="duty_cycle" value="1.0"/>	
 		</component>
 <!--**********************************************************************-->
 		<component id="system.NoC0" name="noc0">
 			<param name="clockrate" value="1200"/>
 			<param name="type" value="1"/>
 			<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
 				at each time only one node can send req -->
 			<param name="horizontal_nodes" value="1"/>
 			<param name="vertical_nodes" value="1"/>
 			<param name="has_global_link" value="1"/>
 			<!-- 1 has global link, 0 does not have global link -->
 			<param name="link_throughput" value="1"/><!--w.r.t clock -->
 			<param name="link_latency" value="1"/><!--w.r.t clock -->
 			<!-- throughput >= latency -->
 			<!-- Router architecture -->
 			<param name="input_ports" value="8"/>
 			<param name="output_ports" value="7"/>
 			<!-- For bus the I/O ports should be 1 -->
 			<param name="virtual_channel_per_port" value="2"/>
 			<param name="input_buffer_entries_per_vc" value="128"/>
 			<param name="flit_bits" value="40"/>
 			<param name="chip_coverage" value="1"/>
 			<!-- When multiple NOC present, one NOC will cover part of the whole chip. 
 				chip_coverage <=1 -->
 			<param name="link_routing_over_percentage" value="1.0"/>
 			<!-- Links can route over other components or occupy whole area.
 				by default, 50% of the NoC global links routes over other 
 				components -->
 			<stat name="total_accesses" value="100000"/>
 			<!-- This is the number of total accesses within the whole network not for each router -->
 			<stat name="duty_cycle" value="1"/>
 		</component>		
 <!--**********************************************************************-->
 		<component id="system.mem" name="mem">
 			<!-- Main memory property -->
 			<param name="mem_tech_node" value="180"/>
 			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
 			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
 			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
 			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
 			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
 			<!-- above numbers can be easily found from Wikipedia -->
 			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
 			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
 			Current McPAT assumes single DIMMs are used.--> 		
 			<param name="number_ranks" value="2"/>
 			<param name="num_banks_of_DRAM_chip" value="8"/>			
 			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
 			<param name="output_width_of_DRAM_chip" value="8"/>
 			<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
 			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
 			<param name="burstlength_of_DRAM_chip" value="8"/>
 			<stat name="memory_accesses" value="1052"/>
 			<stat name="memory_reads" value="1052"/>
 			<stat name="memory_writes" value="1052"/>									
 		</component>
 		<component id="system.mc" name="mc">
 			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
 			<!-- the current version of McPAT uses published values for the base parameters of the memory controller;
 			improvements to the MC model will be added in later versions. -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="mc_clock" value="800"/><!--MHz-->
 			<param name="peak_transfer_rate" value="1600"/><!--MB/S-->
 			<param name="block_size" value="16"/><!--B-->
 			<param name="number_mcs" value="2"/>
 			<!-- current McPAT only supports homogeneous memory controllers -->
 			<param name="memory_channels_per_mc" value="2"/>
 			<param name="number_ranks" value="2"/>
 			<param name="withPHY" value="0"/>
 			<!-- # of ranks of each channel-->
 			<param name="req_window_size_per_channel" value="32"/>
 			<param name="IO_buffer_size_per_channel" value="32"/>
 			<param name="databus_width" value="32"/>
 			<param name="addressbus_width" value="32"/>
 			<!-- McPAT will add the control bus width to the addressbus width automatically -->
 			<stat name="memory_accesses" value="6666"/>
 			<stat name="memory_reads" value="3333"/>
 			<stat name="memory_writes" value="3333"/>
 			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
 			the average power per MC or per channel. This is sufficient for most applications.
 			Further breakdown can be easily added in later versions. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.niu" name="niu">
 			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
 			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
 				 The lower bound on the clock rate of a 10Gb MAC is 150MHz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
 			the average power per NIC or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.pcie" name="pcie">
 			<!-- On chip PCIe controller, including Phy-->
 			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
 				 The lower bound on the clock rate of the per-lane PCIe logic is 120MHz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="withPHY" value="1"/>
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/>
 			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
 			the average power per PCIe controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.flashc" name="flashc">
 		    <param name="number_flashcs" value="0"/>
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
            <param name="withPHY" value="1"/>
 			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
 			the average power per flash controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		</component>
 </component>
--- a/ext/mcpat/Niagara1.xml
+++ b/ext/mcpat/Niagara1.xml
@ -0,0 +1,442 @@
 <?xml version="1.0" ?>
 <component id="root" name="root">
 	<component id="system" name="system">
 		<!--McPAT will skip the components if number is set to 0 -->
 		<param name="number_of_cores" value="8"/>
 		<param name="number_of_L1Directories" value="4"/>
 		<param name="number_of_L2Directories" value="0"/>
 		<param name="number_of_L2s" value="4"/> <!-- This number means how many L2 clusters; in each cluster there can be multiple banks/ports -->
 		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
 		<param name="number_of_NoCs" value="1"/>
 		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
 		<param name="homogeneous_L2s" value="1"/>
 		<param name="homogeneous_L1Directorys" value="1"/>
 		<param name="homogeneous_L2Directorys" value="1"/>
 		<param name="homogeneous_L3s" value="1"/>
 		<param name="homogeneous_ccs" value="1"/><!-- cache coherence hardware -->
 		<param name="homogeneous_NoCs" value="1"/>
 		<param name="core_tech_node" value="90"/><!-- nm -->
 		<param name="target_core_clockrate" value="1200"/><!--MHz -->
 		<param name="temperature" value="380"/> <!-- Kelvin -->
 		<param name="number_cache_levels" value="2"/>
 		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
 		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
 		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
 		<param name="machine_bits" value="64"/>
 		<param name="virtual_address_width" value="64"/>
 		<param name="physical_address_width" value="52"/>
 		<param name="virtual_memory_page_size" value="4096"/>
 		<stat name="total_cycles" value="100000"/>
 		<stat name="idle_cycles" value="0"/>
 		<stat name="busy_cycles"  value="100000"/>
 			<!-- This page size (B) is completely different from the page size in the main memory section. This page size is the size of
 			a virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
 		<!-- *********************** cores ******************* -->
 		<component id="system.core0" name="core0">
 			<!-- Core property -->
 			<param name="clock_rate" value="1200"/>
 			<param name="instruction_length" value="32"/>
 			<param name="opcode_width" value="9"/>
 			<!-- address width determines the tag_width in caches, the LSQ, and buffers in the cache controller;
 			the default value is machine_bits, if not set -->
 			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
 			<!-- inorder/OoO -->
 			<param name="number_hardware_threads" value="4"/>
 			<!-- number_instruction_fetch_ports (icache ports) is always 1 in single-threaded processors;
 			it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since
 			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once. -->
 			<param name="fetch_width" value="1"/>
 			<!-- fetch_width determines the size of cachelines of L1 cache block -->
 			<param name="number_instruction_fetch_ports" value="1"/>
 			<param name="decode_width" value="1"/>
 			<!-- decode_width determines the number of ports of the
 			renaming table (both RAM and CAM) scheme -->
 			<param name="issue_width" value="1"/>
 			<!-- issue_width determines the number of ports of the issue window and other logic,
 			as in the complexity-effective processors paper; issue_width==dispatch_width -->
 			<param name="commit_width" value="1"/>
 			<!-- commit_width determins the number of ports of register files -->
 			<param name="fp_issue_width" value="1"/>
 			<param name="prediction_width" value="0"/> 
 			<!-- number of branch instructions that can be predicted simultaneously -->
 			<!-- The current version of McPAT does not distinguish int and floating point pipelines.
 			These parameters are reserved for future use. -->
 			<param name="pipelines_per_core" value="1,1"/>
 			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
 			<param name="pipeline_depth" value="6,6"/>
 			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
 			<!-- issue and exe unit-->
 			<param name="ALU_per_core" value="1"/>
 			<!-- contains an adder, a shifter, and a logical unit -->
 			<param name="MUL_per_core" value="1"/>
 			<!-- For MUL and Div -->
 			<param name="FPU_per_core" value="0.125"/>		
 			<!-- buffer between IF and ID stage -->
 			<param name="instruction_buffer_size" value="16"/>
 			<!-- buffer between ID and sche/exe stage -->
 			<param name="decoded_stream_buffer_size" value="16"/>
 			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
 			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
 			<param name="instruction_window_size" value="16"/>
 			<param name="fp_instruction_window_size" value="16"/>
 			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
 			<param name="ROB_size" value="80"/>
 			<!-- each in-flight instruction has an entry in ROB -->
 			<!-- registers -->
 			<param name="archi_Regs_IRF_size" value="32"/>			
 			<param name="archi_Regs_FRF_size" value="32"/>
 			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
 			renaming logic is for both integer and floating point insts.  -->
 			<param name="phy_Regs_IRF_size" value="80"/>
 			<param name="phy_Regs_FRF_size" value="80"/>
 			<!-- rename logic -->
 			<param name="rename_scheme" value="0"/>
 			<!-- can be RAM based(0) or CAM based(1) rename scheme 
 			RAM-based scheme will have free list, status table;
 			CAM-based scheme have the valid bit in the data field of the CAM 
 			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
 			Detailed RAT Implementation see TR -->
 			<param name="register_windows_size" value="8"/>
 			<!-- how many windows in the windowed register file, sun processors;
 			no register windowing is used when this number is 0 -->
 			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (OoO, Alpha).
 			They will always try to execute out of order, though. -->
 			<param name="LSU_order" value="inorder"/>
 			<param name="store_buffer_size" value="32"/>
 			<!-- By default, in-order cores do not have load buffers -->
 			<param name="load_buffer_size" value="32"/>	
 			<!-- number of ports refer to sustainable concurrent memory accesses --> 
 			<param name="memory_ports" value="1"/>	
 			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of the load and store buffers
 			as well as the ports of the Dcache which is connected to the LSU -->
 			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
 			<param name="RAS_size" value="32"/>						
 			<!-- general stats, define the simulation period; total, idle, and busy cycles are required for a sanity check -->
 			<!-- please note: if the target architecture is X86, then all the instructions refer to (fused) micro-ops -->
 			<stat name="total_instructions" value="800000"/>
 			<stat name="int_instructions" value="600000"/>
 			<stat name="fp_instructions" value="20000"/>
 			<stat name="branch_instructions" value="0"/>
 			<stat name="branch_mispredictions" value="0"/>
 			<stat name="load_instructions" value="100000"/>
 			<stat name="store_instructions" value="100000"/>
 			<stat name="committed_instructions" value="800000"/>
 			<stat name="committed_int_instructions" value="600000"/>
 			<stat name="committed_fp_instructions" value="20000"/>
 			<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
 			<!-- the following cycle stats are used for heterogeneous cores only;
 				please ignore them for homogeneous cores -->
 			<stat name="total_cycles" value="100000"/>
 		    <stat name="idle_cycles" value="0"/>
 		    <stat name="busy_cycles"  value="100000"/>
 			<!-- instruction buffer stats -->
 			<!-- ROB stats: both RS-based and Phy-based OoO cores have a ROB.
 			The performance simulator should capture the difference in accesses;
 			otherwise, McPAT has to guess based on the number of committed instructions. -->
 			<stat name="ROB_reads" value="263886"/>
 			<stat name="ROB_writes" value="263886"/>
 			<!-- RAT accesses -->
 			<stat name="rename_accesses" value="263886"/>
 			<stat name="fp_rename_accesses" value="263886"/>
 			<!-- decode and rename stage use this, should be total ic - nop -->
 			<!-- Inst window stats -->
 			<stat name="inst_window_reads" value="263886"/>
 			<stat name="inst_window_writes" value="263886"/>
 			<stat name="inst_window_wakeup_accesses" value="263886"/>
 			<stat name="fp_inst_window_reads" value="263886"/>
 			<stat name="fp_inst_window_writes" value="263886"/>
 			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
 			<!--  RF accesses -->
 			<stat name="int_regfile_reads" value="1600000"/>
 			<stat name="float_regfile_reads" value="40000"/>
 			<stat name="int_regfile_writes" value="800000"/>
 			<stat name="float_regfile_writes" value="20000"/>
 			<!-- accesses to the working reg -->
 			<stat name="function_calls" value="5"/>
 			<stat name="context_switches" value="260343"/>
 			<!-- Number of window switches (number of function calls and returns) -->
 			<!-- ALU stats. By default, the processor has one FPU that includes the divider and
 			 multiplier. The FPU accesses should include accesses to the multiplier and divider -->
 			<stat name="ialu_accesses" value="800000"/>			
 			<stat name="fpu_accesses" value="10000"/>
 			<stat name="mul_accesses" value="100000"/>
 			<stat name="cdb_alu_accesses" value="1000000"/>
 			<stat name="cdb_mul_accesses" value="0"/>
 			<stat name="cdb_fpu_accesses" value="0"/>
 			<!-- multiple-cycle accesses should be counted multiple times;
 			otherwise, McPAT can use an internal counter for different floating point instructions
 			to get the final accesses. But that needs detailed info on the floating point instruction mix -->
 			<!-- Currently the performance simulator should
 			make sure all the numbers are final numbers,
 			including the explicit read/write accesses
 			and the implicit accesses such as replacements, etc.
 			Future versions of McPAT may be able to infer the implicit accesses
 			based on the params and stats of the last level cache.
 			The same rule applies to all cache access stats too! -->
 			<!-- following is AF for max power computation. 
 				Do not change them, unless you understand them-->
 			<stat name="IFU_duty_cycle" value="0.25"/>			
 			<stat name="LSU_duty_cycle" value="0.25"/>
 			<stat name="MemManU_I_duty_cycle" value="1"/>
 			<stat name="MemManU_D_duty_cycle" value="0.25"/>
 			<stat name="ALU_duty_cycle" value="0.9"/>
 			<stat name="MUL_duty_cycle" value="0.5"/>
 			<stat name="FPU_duty_cycle" value="0.4"/>
 			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
 			<stat name="MUL_cdb_duty_cycle" value="0.5"/>
 			<stat name="FPU_cdb_duty_cycle" value="0.4"/>
 			<component id="system.core0.predictor" name="PBT">
 				<!-- branch predictor; tournament predictor see Alpha implementation -->
 				<param name="local_predictor_size" value="10,3"/>
 				<param name="local_predictor_entries" value="1024"/>
 				<param name="global_predictor_entries" value="4096"/>
 				<param name="global_predictor_bits" value="2"/>
 				<param name="chooser_predictor_entries" value="4096"/>
 				<param name="chooser_predictor_bits" value="2"/>
 				<!-- These parameters can be combined like below in next version
 				<param name="load_predictor" value="10,3,1024"/>
 				<param name="global_predictor" value="4096,2"/>
 				<param name="predictor_chooser" value="4096,2"/>
 				-->
 			</component>
 			<component id="system.core0.itlb" name="itlb">
 				<param name="number_entries" value="64"/>
 				<stat name="total_accesses" value="800000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 				<!-- there are no write requests to the itlb, although writes happen to the itlb after a miss;
 				these are actually replacements -->
 			</component>
 			<component id="system.core0.icache" name="icache">
 				<!-- there are no write requests to the icache, although writes happen to it after a miss;
 				these are actually replacements -->
 				<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
 				<param name="buffer_sizes" value="16, 16, 16,0"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
 				<stat name="read_accesses" value="200000"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="conflicts" value="0"/>				
 			</component>
 			<component id="system.core0.dtlb" name="dtlb">
 				<param name="number_entries" value="64"/>
 				<stat name="total_accesses" value="200000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.dcache" name="dcache">
 			        <!-- all the buffer related are optional -->
 				<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.BTB" name="BTB">
 			        <!-- all the buffer related are optional -->
 				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
 				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			</component>
 	</component>
 		<component id="system.L1Directory0" name="L1Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="1200"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. invalidates can be considered as writes -->
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="20"/>	
 				<stat name="duty_cycle" value="0.45"/>	
 		</component>
 		<component id="system.L2Directory0" name="L2Directory0">
 				<param name="Directory_type" value="1"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="1200"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. invalidates can be considered as writes -->
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="100"/>
 			    <stat name="duty_cycle" value="0.45"/>		
 		</component>
 		<component id="system.L20" name="L20">
 			<!-- all the buffer related are optional -->
 				<param name="L2_config" value="786432,64,16,1, 4,23, 64, 1"/>
 			    <!-- consider 4-way bank interleaving for Niagara 1 -->
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<param name="clockrate" value="1200"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="0"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="write_misses" value="0"/>
 				<stat name="conflicts" value="0"/>	
 			    <stat name="duty_cycle" value="0.5"/>	
 		</component>
 <!--**********************************************************************-->
 <component id="system.L30" name="L30">
 				<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 	            <stat name="duty_cycle" value="0.35"/>	
 		</component>
 <!--**********************************************************************-->
 		<component id="system.NoC0" name="noc0">
 			<param name="clockrate" value="1200"/>
 			<param name="type" value="1"/>
 			<!-- 1 NoC, 0 bus -->
 			<param name="horizontal_nodes" value="2"/>
 			<param name="vertical_nodes" value="1"/>
 			<param name="has_global_link" value="0"/>
 			<!-- 1 has global link, 0 does not have global link -->
 			<param name="link_throughput" value="1"/><!--w.r.t clock -->
 			<param name="link_latency" value="1"/><!--w.r.t clock -->
 			<!-- throughput >= latency -->
 			<!-- Router architecture -->
 			<param name="input_ports" value="8"/>
 			<param name="output_ports" value="5"/>
 			<param name="virtual_channel_per_port" value="1"/>
 			<!-- input buffer; in classic routers only input ports need buffers -->
 			<param name="flit_bits" value="136"/>
 			<param name="input_buffer_entries_per_vc" value="2"/><!-- VCs within the same port share input buffers whose size is proportional to the number of VCs -->
 			<param name="chip_coverage" value="1"/>
 			<!-- When multiple NoCs are present, one NoC will cover part of the whole chip. chip_coverage <=1 -->
 			<stat name="total_accesses" value="360000"/>
 			<!-- This is the number of total accesses within the whole network not for each router -->
 			<stat name="duty_cycle" value="0.6"/>
 		</component>
 <!--**********************************************************************-->
 		<component id="system.mem" name="mem">
 			<!-- Main memory property -->
 			<param name="mem_tech_node" value="32"/>
 			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
 			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
 			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
 			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
 			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
 			<!-- above numbers can be easily found from Wikipedia -->
 			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
 			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
 			Current McPAT assumes single DIMMs are used.--> 		
 			<param name="number_ranks" value="2"/>
 			<param name="num_banks_of_DRAM_chip" value="8"/>			
 			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
 			<param name="output_width_of_DRAM_chip" value="8"/>
 			<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
 			<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
 			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
 			<param name="burstlength_of_DRAM_chip" value="8"/>
 			<stat name="memory_accesses" value="1052"/>
 			<stat name="memory_reads" value="1052"/>
 			<stat name="memory_writes" value="1052"/>									
 		</component>
 		<component id="system.mc" name="mc">
 			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
 			<!-- The current version of McPAT uses published values for the base parameters of the memory controller;
 			improvements on the MC will be added in later versions. -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
 			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
 			<param name="block_size" value="64"/><!--B-->
 			<param name="number_mcs" value="4"/>
 			<!-- current McPAT only supports homogeneous memory controllers -->
 			<param name="memory_channels_per_mc" value="1"/>
 			<param name="number_ranks" value="2"/>
 			<param name="withPHY" value="0"/>
 			<!-- # of ranks of each channel-->
 			<param name="req_window_size_per_channel" value="32"/>
 			<param name="IO_buffer_size_per_channel" value="32"/>
 			<param name="databus_width" value="128"/>
 			<param name="addressbus_width" value="51"/>
 			<!-- McPAT will add the control bus width to the addressbus width automatically -->
 			<stat name="memory_accesses" value="33333"/>
 			<stat name="memory_reads" value="16667"/>
 			<stat name="memory_writes" value="16667"/>
 			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
 			the average power per MC or per channel. This is sufficient for most applications.
 			Further breakdown can easily be added in later versions. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.niu" name="niu">
 			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
 			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
 				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
 			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
 			the average power per NIC or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.pcie" name="pcie">
 			<!-- On chip PCIe controller, including Phy-->
 			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
 				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="withPHY" value="1"/>
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/>
 			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
 			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
 			the average power per PCIe controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.flashc" name="flashc">
 		    <param name="number_flashcs" value="0"/>
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
            <param name="withPHY" value="1"/>
 			<param name="peak_transfer_rate" value="200"/><!-- Per-controller sustainable peak rate, MB/s -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
 			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
 			the average power per flash controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		</component>
 </component>
--- a/ext/mcpat/Niagara1_sharing.xml
+++ b/ext/mcpat/Niagara1_sharing.xml
@ -0,0 +1,400 @@
 <?xml version="1.0" ?>
 <component id="root" name="root">
 	<component id="system" name="system">
 		<!--McPAT will skip the components if number is set to 0 -->
 		<param name="number_of_cores" value="64"/>
 		<param name="number_of_L1Directories" value="0"/>
 		<param name="number_of_L2Directories" value="0"/>
 		<param name="number_of_L2s" value="64"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
 		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
 		<param name="number_of_NoCs" value="1"/>
 		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
 		<param name="homogeneous_L2s" value="1"/>
 		<param name="homogeneous_L1Directorys" value="1"/>
 		<param name="homogeneous_L2Directorys" value="1"/>
 		<param name="homogeneous_L3s" value="1"/>
 		<param name="homogeneous_ccs" value="1"/><!-- cache coherence hardware -->
 		<param name="homogeneous_NoCs" value="1"/>
 		<param name="core_tech_node" value="22"/><!-- nm -->
 		<param name="target_core_clockrate" value="3500"/><!--MHz -->
 		<param name="temperature" value="360"/> <!-- Kelvin -->
 		<param name="number_cache_levels" value="2"/>
 		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
 		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
 		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
 		<param name="machine_bits" value="64"/>
 		<param name="virtual_address_width" value="64"/>
 		<param name="physical_address_width" value="52"/>
 		<param name="virtual_memory_page_size" value="4096"/>
 		<stat name="total_cycles" value="100000"/>
 		<stat name="idle_cycles" value="0"/>
 		<stat name="busy_cycles"  value="100000"/>
 			<!-- virtual_memory_page_size (in bytes) is completely different from the page size in the main memory section. It is the size of a
 			virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank. -->
 		<!-- *********************** cores ******************* -->
 		<component id="system.core0" name="core0">
 			<!-- Core property -->
 			<param name="clock_rate" value="3500"/>
 			<param name="instruction_length" value="32"/>
 			<param name="opcode_width" value="9"/>
 			<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
 			default value is machine_bits, if not set -->
 			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
 			<!-- inorder/OoO -->
 			<param name="number_hardware_threads" value="4"/>
 			<!-- number_instruction_fetch_ports (icache ports) is always 1 in single-thread processors;
 			it may only be more than one in SMT processors. BTB ports always equal fetch ports, since
 			branch information for consecutive branch instructions in the same fetch group can be read out of the BTB at once. -->
 			<param name="fetch_width" value="1"/>
 			<!-- fetch_width determines the size of cachelines of L1 cache block -->
 			<param name="number_instruction_fetch_ports" value="1"/>
 			<param name="decode_width" value="1"/>
 			<!-- decode_width determines the number of ports of the
 			renaming table (both RAM and CAM) scheme -->
 			<param name="issue_width" value="1"/>
 			<!-- issue_width determines the number of ports of the issue window and other logic,
 			as in the complexity-effective processors paper; issue_width==dispatch_width -->
 			<param name="commit_width" value="1"/>
 			<!-- commit_width determines the number of ports of register files -->
 			<param name="fp_issue_width" value="1"/>
 			<param name="prediction_width" value="0"/> 
 			<!-- number of branch instructions that can be predicted simultaneously -->
 			<!-- Current version of McPAT does not distinguish int and floating point pipelines.
 			These parameters are reserved for future use. -->
 			<param name="pipelines_per_core" value="1,1"/>
 			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
 			<param name="pipeline_depth" value="6,6"/>
 			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
 			<!-- issue and exe unit-->
 			<param name="ALU_per_core" value="1"/>
 			<!-- contains an adder, a shifter, and a logical unit -->
 			<param name="MUL_per_core" value="1"/>
 			<!-- For MUL and Div -->
 			<param name="FPU_per_core" value="0.125"/>		
 			<!-- buffer between IF and ID stage -->
 			<param name="instruction_buffer_size" value="16"/>
 			<!-- buffer between ID and sche/exe stage -->
 			<param name="decoded_stream_buffer_size" value="16"/>
 			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
 			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
 			<param name="instruction_window_size" value="16"/>
 			<param name="fp_instruction_window_size" value="16"/>
 			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
 			<param name="ROB_size" value="80"/>
 			<!-- each in-flight instruction has an entry in ROB -->
 			<!-- registers -->
 			<param name="archi_Regs_IRF_size" value="32"/>			
 			<param name="archi_Regs_FRF_size" value="32"/>
 			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
 			renaming logic is for both integer and floating point insts.  -->
 			<param name="phy_Regs_IRF_size" value="80"/>
 			<param name="phy_Regs_FRF_size" value="80"/>
 			<!-- rename logic -->
 			<param name="rename_scheme" value="0"/>
 			<!-- can be RAM based(0) or CAM based(1) rename scheme 
 			RAM-based scheme will have free list, status table;
 			CAM-based scheme have the valid bit in the data field of the CAM 
 			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
 			Detailed RAT Implementation see TR -->
 			<param name="register_windows_size" value="8"/>
 			<!-- how many windows in the windowed register file, sun processors;
 			no register windowing is used when this number is 0 -->
 			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
 			they will always try to execute out of order though. -->
 			<param name="LSU_order" value="inorder"/>
 			<param name="store_buffer_size" value="32"/>
 			<!-- By default, in-order cores do not have load buffers -->
 			<param name="load_buffer_size" value="32"/>	
 			<!-- number of ports refer to sustainable concurrent memory accesses --> 
 			<param name="memory_ports" value="1"/>	
 			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
 			as well as the ports of Dcache which is connected to LSU -->
 			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
 			<param name="RAS_size" value="32"/>						
 			<!-- general stats, define simulation periods; total, idle, and busy cycles are required for sanity check -->
 			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
 			<stat name="total_instructions" value="800000"/>
 			<stat name="int_instructions" value="600000"/>
 			<stat name="fp_instructions" value="20000"/>
 			<stat name="branch_instructions" value="0"/>
 			<stat name="branch_mispredictions" value="0"/>
 			<stat name="load_instructions" value="100000"/>
 			<stat name="store_instructions" value="100000"/>
 			<stat name="committed_instructions" value="800000"/>
 			<stat name="committed_int_instructions" value="600000"/>
 			<stat name="committed_fp_instructions" value="20000"/>
 			<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
 			<!-- the following cycle stats are used for heterogeneous cores only;
 				please ignore them if cores are homogeneous -->
 			<stat name="total_cycles" value="100000"/>
 		    <stat name="idle_cycles" value="0"/>
 		    <stat name="busy_cycles"  value="100000"/>
 			<!-- instruction buffer stats -->
 			<!-- ROB stats: both RS-based and Phy-based OoO cores have a ROB.
 			The performance simulator should capture the difference in accesses;
 			otherwise, McPAT has to guess based on the number of committed instructions. -->
 			<stat name="ROB_reads" value="263886"/>
 			<stat name="ROB_writes" value="263886"/>
 			<!-- RAT accesses -->
 			<stat name="rename_accesses" value="263886"/>
 			<stat name="fp_rename_accesses" value="263886"/>
 			<!-- decode and rename stage use this, should be total ic - nop -->
 			<!-- Inst window stats -->
 			<stat name="inst_window_reads" value="263886"/>
 			<stat name="inst_window_writes" value="263886"/>
 			<stat name="inst_window_wakeup_accesses" value="263886"/>
 			<stat name="fp_inst_window_reads" value="263886"/>
 			<stat name="fp_inst_window_writes" value="263886"/>
 			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
 			<!--  RF accesses -->
 			<stat name="int_regfile_reads" value="1600000"/>
 			<stat name="float_regfile_reads" value="40000"/>
 			<stat name="int_regfile_writes" value="800000"/>
 			<stat name="float_regfile_writes" value="20000"/>
 			<!-- accesses to the working reg -->
 			<stat name="function_calls" value="5"/>
 			<stat name="context_switches" value="260343"/>
 			<!-- Number of window switches (number of function calls and returns) -->
 			<!-- ALU stats: by default, the processor has one FPU that includes the divider and
 			 multiplier. The FPU accesses should include accesses to the multiplier and divider. -->
 			<stat name="ialu_accesses" value="800000"/>			
 			<stat name="fpu_accesses" value="10000"/>
 			<stat name="mul_accesses" value="100000"/>
 			<stat name="cdb_alu_accesses" value="1000000"/>
 			<stat name="cdb_mul_accesses" value="0"/>
 			<stat name="cdb_fpu_accesses" value="0"/>
 			<!-- multi-cycle accesses should be counted multiple times;
 			otherwise, McPAT can use internal counters for different floating point instructions
 			to get final accesses. But that needs detailed info on the floating point inst mix -->
 			<!-- currently the performance simulator should
 			make sure all the numbers are final numbers,
 			including the explicit read/write accesses
 			and the implicit accesses such as replacements, etc.
 			Future versions of McPAT may be able to infer the implicit accesses
 			based on the params and stats of the last level cache.
 			The same rule applies to all cache access stats too! -->
 			<!-- following is AF for max power computation. 
 				Do not change them, unless you understand them-->
 			<stat name="IFU_duty_cycle" value="0.25"/>			
 			<stat name="LSU_duty_cycle" value="0.25"/>
 			<stat name="MemManU_I_duty_cycle" value="1"/>
 			<stat name="MemManU_D_duty_cycle" value="0.25"/>
 			<stat name="ALU_duty_cycle" value="0.9"/>
 			<stat name="MUL_duty_cycle" value="0.5"/>
 			<stat name="FPU_duty_cycle" value="0.4"/>
 			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
 			<stat name="MUL_cdb_duty_cycle" value="0.5"/>
 			<stat name="FPU_cdb_duty_cycle" value="0.4"/>
 			<component id="system.core0.predictor" name="PBT">
 				<!-- branch predictor; tournament predictor see Alpha implementation -->
 				<param name="local_predictor_size" value="10,3"/>
 				<param name="local_predictor_entries" value="1024"/>
 				<param name="global_predictor_entries" value="4096"/>
 				<param name="global_predictor_bits" value="2"/>
 				<param name="chooser_predictor_entries" value="4096"/>
 				<param name="chooser_predictor_bits" value="2"/>
 				<!-- These parameters can be combined like below in next version
 				<param name="load_predictor" value="10,3,1024"/>
 				<param name="global_predictor" value="4096,2"/>
 				<param name="predictor_chooser" value="4096,2"/>
 				-->
 			</component>
 			<component id="system.core0.itlb" name="itlb">
 				<param name="number_entries" value="64"/>
 				<stat name="total_accesses" value="800000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 				<!-- there are no write requests to the itlb; writes do happen to the itlb after a miss,
 				but each is actually a replacement -->
 			</component>
 			<component id="system.core0.icache" name="icache">
 				<!-- there are no write requests to the icache; writes do happen to it after a miss,
 				but each is actually a replacement -->
 				<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<!-- cache_policy: 0 no write or write-through with non-write allocate; 1 write-back with write-allocate -->
 				<param name="buffer_sizes" value="16, 16, 16,0"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
 				<stat name="read_accesses" value="200000"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="conflicts" value="0"/>				
 			</component>
 			<component id="system.core0.dtlb" name="dtlb">
 				<param name="number_entries" value="64"/>
 				<stat name="total_accesses" value="200000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.dcache" name="dcache">
 			        <!-- all the buffer related are optional -->
 				<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.BTB" name="BTB">
 			        <!-- all the buffer related are optional -->
 				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
 				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			</component>
 	</component>
 		<component id="system.L1Directory0" name="L1Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="20"/>	
 				<stat name="duty_cycle" value="0.45"/>	
 		</component>
 		<component id="system.L2Directory0" name="L2Directory0">
 				<param name="Directory_type" value="1"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="100"/>
 			    <stat name="duty_cycle" value="0.45"/>		
 		</component>
 		<component id="system.L20" name="L20">
 			<!-- all the buffer related are optional -->
 				<param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
 			    <!-- consider 4-way bank interleaving for Niagara 1 -->
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="0"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="write_misses" value="0"/>
 				<stat name="conflicts" value="0"/>	
 			    <stat name="duty_cycle" value="0.5"/>	
 		</component>
 <!--**********************************************************************-->
 <component id="system.L30" name="L30">
 				<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 	            <stat name="duty_cycle" value="0.35"/>	
 		</component>
 <!--**********************************************************************-->
 		<component id="system.NoC0" name="noc0">
 			<param name="clockrate" value="3500"/>
 			<param name="type" value="1"/>
 			<!-- 1 NoC, 0 bus -->
 			<param name="horizontal_nodes" value="8"/>
 			<param name="vertical_nodes" value="8"/>
 			<param name="has_global_link" value="1"/>
 			<!-- 1 has global link, 0 does not have global link -->
 			<param name="link_throughput" value="1"/><!--w.r.t clock -->
 			<param name="link_latency" value="1"/><!--w.r.t clock -->
 			<!-- throughput >= latency -->
 			<!-- Router architecture -->
 			<param name="input_ports" value="5"/>
 			<param name="output_ports" value="5"/>
 			<param name="virtual_channel_per_port" value="1"/>
 			<!-- input buffer; in classic routers only input ports need buffers -->
 			<param name="flit_bits" value="256"/>
 			<param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
 			<param name="chip_coverage" value="1"/>
 			<!-- When multiple NoCs are present, one NoC will cover part of the whole chip. chip_coverage <=1 -->
 			<stat name="total_accesses" value="360000"/>
 			<!-- This is the number of total accesses within the whole network not for each router -->
 			<stat name="duty_cycle" value="0.1"/>
 		</component>
 <!--**********************************************************************-->
 		<component id="system.mem" name="mem">
 			<!-- Main memory property -->
 			<param name="mem_tech_node" value="32"/>
 			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
 			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
 			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
 			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
 			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
 			<!-- above numbers can be easily found from Wikipedia -->
 			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
 			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
 			Current McPAT assumes single DIMMs are used.--> 		
 			<param name="number_ranks" value="2"/>
 			<param name="num_banks_of_DRAM_chip" value="8"/>			
 			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
 			<param name="output_width_of_DRAM_chip" value="8"/>
 			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
 			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
 			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
 			<param name="burstlength_of_DRAM_chip" value="8"/>
 			<stat name="memory_accesses" value="1052"/>
 			<stat name="memory_reads" value="1052"/>
 			<stat name="memory_writes" value="1052"/>									
 		</component>
 		<component id="system.mc" name="mc">
 			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
 			<!-- current version of McPAT uses published values for base parameters of memory controller;
 			improvements on MC will be added in later versions. -->
 			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
 			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
 			<param name="llc_line_length" value="64"/><!--B-->
 			<param name="number_mcs" value="4"/>
 			<!-- current McPAT only supports homogeneous memory controllers -->
 			<param name="memory_channels_per_mc" value="1"/>
 			<param name="number_ranks" value="2"/>
 			<!-- # of ranks of each channel-->
 			<param name="req_window_size_per_channel" value="32"/>
 			<param name="IO_buffer_size_per_channel" value="32"/>
 			<param name="databus_width" value="128"/>
 			<param name="addressbus_width" value="51"/>
 			<!-- McPAT will add the control bus width to the addressbus width automatically -->
 			<stat name="memory_accesses" value="33333"/>
 			<stat name="memory_reads" value="16667"/>
 			<stat name="memory_writes" value="16667"/>
 			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
 			the average power per MC or per channel. This is sufficient for most applications.
 			Further tracking can easily be added in later versions. -->
 		</component>
 <!--**********************************************************************-->
 	</component>
 </component>
--- a/ext/mcpat/Niagara1_sharing_DC.xml
+++ b/ext/mcpat/Niagara1_sharing_DC.xml
@ -0,0 +1,442 @@
 <?xml version="1.0" ?>
 <component id="root" name="root">
 	<component id="system" name="system">
 		<!--McPAT will skip the components if number is set to 0 -->
 		<param name="number_of_cores" value="64"/>
 		<param name="number_of_L1Directories" value="0"/>
 		<param name="number_of_L2Directories" value="8"/>
 		<param name="number_of_L2s" value="64"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
 		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
 		<param name="number_of_NoCs" value="1"/>
 		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
 		<param name="homogeneous_L2s" value="1"/>
 		<param name="homogeneous_L1Directorys" value="1"/>
 		<param name="homogeneous_L2Directorys" value="1"/>
 		<param name="homogeneous_L3s" value="1"/>
 		<param name="homogeneous_ccs" value="1"/><!-- cache coherence hardware -->
 		<param name="homogeneous_NoCs" value="1"/>
 		<param name="core_tech_node" value="22"/><!-- nm -->
 		<param name="target_core_clockrate" value="3500"/><!--MHz -->
 		<param name="temperature" value="360"/> <!-- Kelvin -->
 		<param name="number_cache_levels" value="2"/>
 		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
 		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
 		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
 		<param name="machine_bits" value="64"/>
 		<param name="virtual_address_width" value="64"/>
 		<param name="physical_address_width" value="52"/>
 		<param name="virtual_memory_page_size" value="4096"/>
 		<stat name="total_cycles" value="100000"/>
 		<stat name="idle_cycles" value="0"/>
 		<stat name="busy_cycles"  value="100000"/>
 			<!-- virtual_memory_page_size (in bytes) is completely different from the page size in the main memory section. It is the size of a
 			virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank. -->
 		<!-- *********************** cores ******************* -->
 		<component id="system.core0" name="core0">
 			<!-- Core property -->
 			<param name="clock_rate" value="3500"/>
 			<param name="instruction_length" value="32"/>
 			<param name="opcode_width" value="9"/>
 			<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
 			default value is machine_bits, if not set -->
 			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
 			<!-- inorder/OoO -->
 			<param name="number_hardware_threads" value="4"/>
 			<!-- number_instruction_fetch_ports (icache ports) is always 1 in single-thread processors;
 			it may only be more than one in SMT processors. BTB ports always equal fetch ports, since
 			branch information for consecutive branch instructions in the same fetch group can be read out of the BTB at once. -->
 			<param name="fetch_width" value="1"/>
 			<!-- fetch_width determines the size of cachelines of L1 cache block -->
 			<param name="number_instruction_fetch_ports" value="1"/>
 			<param name="decode_width" value="1"/>
 			<!-- decode_width determines the number of ports of the
 			renaming table (both RAM and CAM) scheme -->
 			<param name="issue_width" value="1"/>
 			<!-- issue_width determines the number of ports of the issue window and other logic,
 			as in the complexity-effective processors paper; issue_width==dispatch_width -->
 			<param name="commit_width" value="1"/>
 			<!-- commit_width determines the number of ports of register files -->
 			<param name="fp_issue_width" value="1"/>
 			<param name="prediction_width" value="0"/> 
 			<!-- number of branch instructions that can be predicted simultaneously -->
 			<!-- Current version of McPAT does not distinguish int and floating point pipelines.
 			These parameters are reserved for future use. -->
 			<param name="pipelines_per_core" value="1,1"/>
 			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
 			<param name="pipeline_depth" value="6,6"/>
 			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
 			<!-- issue and exe unit-->
 			<param name="ALU_per_core" value="1"/>
 			<!-- contains an adder, a shifter, and a logical unit -->
 			<param name="MUL_per_core" value="1"/>
 			<!-- For MUL and Div -->
 			<param name="FPU_per_core" value="0.125"/>		
 			<!-- buffer between IF and ID stage -->
 			<param name="instruction_buffer_size" value="16"/>
 			<!-- buffer between ID and sche/exe stage -->
 			<param name="decoded_stream_buffer_size" value="16"/>
 			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
 			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
 			<param name="instruction_window_size" value="16"/>
 			<param name="fp_instruction_window_size" value="16"/>
 			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
 			<param name="ROB_size" value="80"/>
 			<!-- each in-flight instruction has an entry in ROB -->
 			<!-- registers -->
 			<param name="archi_Regs_IRF_size" value="32"/>			
 			<param name="archi_Regs_FRF_size" value="32"/>
 			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
 			renaming logic is for both integer and floating point insts.  -->
 			<param name="phy_Regs_IRF_size" value="80"/>
 			<param name="phy_Regs_FRF_size" value="80"/>
 			<!-- rename logic -->
 			<param name="rename_scheme" value="0"/>
 			<!-- can be RAM based(0) or CAM based(1) rename scheme 
 			RAM-based scheme will have free list, status table;
 			CAM-based scheme have the valid bit in the data field of the CAM 
 			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
 			Detailed RAT Implementation see TR -->
 			<param name="register_windows_size" value="8"/>
 			<!-- how many windows in the windowed register file, sun processors;
 			no register windowing is used when this number is 0 -->
 			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
 			they will always try to execute out of order though. -->
 			<param name="LSU_order" value="inorder"/>
 			<param name="store_buffer_size" value="32"/>
 			<!-- By default, in-order cores do not have load buffers -->
 			<param name="load_buffer_size" value="32"/>	
 			<!-- number of ports refer to sustainable concurrent memory accesses --> 
 			<param name="memory_ports" value="1"/>	
 			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
 			as well as the ports of Dcache which is connected to LSU -->
 			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
 			<param name="RAS_size" value="32"/>						
 			<!-- general stats, define simulation periods; total, idle, and busy cycles are required for sanity check -->
 			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
 			<stat name="total_instructions" value="800000"/>
 			<stat name="int_instructions" value="600000"/>
 			<stat name="fp_instructions" value="20000"/>
 			<stat name="branch_instructions" value="0"/>
 			<stat name="branch_mispredictions" value="0"/>
 			<stat name="load_instructions" value="100000"/>
 			<stat name="store_instructions" value="100000"/>
 			<stat name="committed_instructions" value="800000"/>
 			<stat name="committed_int_instructions" value="600000"/>
 			<stat name="committed_fp_instructions" value="20000"/>
 			<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
 			<!-- the following cycle stats are used for heterogeneous cores only;
 				please ignore them if cores are homogeneous -->
 			<stat name="total_cycles" value="100000"/>
 		    <stat name="idle_cycles" value="0"/>
 		    <stat name="busy_cycles"  value="100000"/>
 			<!-- instruction buffer stats -->
 			<!-- ROB stats: both RS-based and Phy-based OoO cores have a ROB.
 			The performance simulator should capture the difference in accesses;
 			otherwise, McPAT has to guess based on the number of committed instructions. -->
 			<stat name="ROB_reads" value="263886"/>
 			<stat name="ROB_writes" value="263886"/>
 			<!-- RAT accesses -->
 			<stat name="rename_accesses" value="263886"/>
 			<stat name="fp_rename_accesses" value="263886"/>
 			<!-- decode and rename stage use this, should be total ic - nop -->
 			<!-- Inst window stats -->
 			<stat name="inst_window_reads" value="263886"/>
 			<stat name="inst_window_writes" value="263886"/>
 			<stat name="inst_window_wakeup_accesses" value="263886"/>
 			<stat name="fp_inst_window_reads" value="263886"/>
 			<stat name="fp_inst_window_writes" value="263886"/>
 			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
 			<!--  RF accesses -->
 			<stat name="int_regfile_reads" value="1600000"/>
 			<stat name="float_regfile_reads" value="40000"/>
 			<stat name="int_regfile_writes" value="800000"/>
 			<stat name="float_regfile_writes" value="20000"/>
 			<!-- accesses to the working reg -->
 			<stat name="function_calls" value="5"/>
 			<stat name="context_switches" value="260343"/>
 			<!-- Number of window switches (number of function calls and returns) -->
 			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
 			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
 			<stat name="ialu_accesses" value="800000"/>			
 			<stat name="fpu_accesses" value="10000"/>
 			<stat name="mul_accesses" value="100000"/>
 			<stat name="cdb_alu_accesses" value="1000000"/>
 			<stat name="cdb_mul_accesses" value="0"/>
 			<stat name="cdb_fpu_accesses" value="0"/>
 			<!-- multiple cycle accesses should be counted multiple times, 
 			otherwise, McPAT can use internal counter for different floating point instructions 
 			to get final accesses. But that needs detailed info for floating point inst mix -->
 			<!-- Currently the performance simulator should
 			make sure all the numbers are final numbers,
 			including the explicit read/write accesses
 			and the implicit accesses such as replacements, etc.
 			Future versions of McPAT may be able to infer the implicit accesses
 			based on the params and stats of the last-level cache.
 			The same rule applies to all cache access stats too! -->
 			<!-- following is AF for max power computation. 
 				Do not change them, unless you understand them-->
 			<stat name="IFU_duty_cycle" value="0.25"/>			
 			<stat name="LSU_duty_cycle" value="0.25"/>
 			<stat name="MemManU_I_duty_cycle" value="1"/>
 			<stat name="MemManU_D_duty_cycle" value="0.25"/>
 			<stat name="ALU_duty_cycle" value="0.9"/>
 			<stat name="MUL_duty_cycle" value="0.5"/>
 			<stat name="FPU_duty_cycle" value="0.4"/>
 			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
 			<stat name="MUL_cdb_duty_cycle" value="0.5"/>
 			<stat name="FPU_cdb_duty_cycle" value="0.4"/>
 			<component id="system.core0.predictor" name="PBT">
 				<!-- branch predictor; tournament predictor see Alpha implementation -->
 				<param name="local_predictor_size" value="10,3"/>
 				<param name="local_predictor_entries" value="1024"/>
 				<param name="global_predictor_entries" value="4096"/>
 				<param name="global_predictor_bits" value="2"/>
 				<param name="chooser_predictor_entries" value="4096"/>
 				<param name="chooser_predictor_bits" value="2"/>
 				<!-- These parameters can be combined like below in next version
 				<param name="load_predictor" value="10,3,1024"/>
 				<param name="global_predictor" value="4096,2"/>
 				<param name="predictor_chooser" value="4096,2"/>
 				-->
 			</component>
 			<component id="system.core0.itlb" name="itlb">
 				<param name="number_entries" value="64"/>
 				<stat name="total_accesses" value="800000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 				<!-- there are no write requests to the itlb, although writes happen to the itlb after a miss,
 				which is actually a replacement -->
 			</component>
 			<component id="system.core0.icache" name="icache">
 				<!-- there are no write requests to the icache, although writes happen to it after a miss,
 				which is actually a replacement -->
 				<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<!-- cache_policy: 0 no write or write-through with non-write allocate; 1 write-back with write-allocate -->
 				<param name="buffer_sizes" value="16, 16, 16,0"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
 				<stat name="read_accesses" value="200000"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="conflicts" value="0"/>				
 			</component>
 			<component id="system.core0.dtlb" name="dtlb">
 				<param name="number_entries" value="64"/>
 				<stat name="total_accesses" value="200000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.dcache" name="dcache">
 			        <!-- all the buffer related are optional -->
 				<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.BTB" name="BTB">
 			        <!-- all the buffer related are optional -->
 				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
 				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			</component>
 	</component>
 		<component id="system.L1Directory0" name="L1Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="20"/>	
 				<stat name="duty_cycle" value="0.45"/>	
 		</component>
 		<component id="system.L2Directory0" name="L2Directory0">
 				<param name="Directory_type" value="1"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="1048576,9,16,1,2, 100"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="100"/>
 			    <stat name="duty_cycle" value="0.45"/>		
 		</component>
 		<component id="system.L20" name="L20">
 			<!-- all the buffer related are optional -->
 				<param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
 			    <!-- consider 4-way bank interleaving for Niagara 1 -->
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="0"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="write_misses" value="0"/>
 				<stat name="conflicts" value="0"/>	
 			    <stat name="duty_cycle" value="0.5"/>	
 		</component>
 <!--**********************************************************************-->
 <component id="system.L30" name="L30">
 				<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 	            <stat name="duty_cycle" value="0.35"/>	
 		</component>
 <!--**********************************************************************-->
 		<component id="system.NoC0" name="noc0">
 			<param name="clockrate" value="3500"/>
 			<param name="type" value="1"/>
 			<!-- 1 NoC, 0 bus -->
 			<param name="horizontal_nodes" value="8"/>
 			<param name="vertical_nodes" value="8"/>
 			<param name="has_global_link" value="1"/>
 			<!-- 1 has global link, 0 does not have global link -->
 			<param name="link_throughput" value="1"/><!--w.r.t clock -->
 			<param name="link_latency" value="1"/><!--w.r.t clock -->
 			<!-- throughput >= latency -->
 			<!-- Router architecture -->
 			<param name="input_ports" value="5"/>
 			<param name="output_ports" value="5"/>
 			<param name="virtual_channel_per_port" value="1"/>
 			<!-- input buffer; in classic routers only input ports need buffers -->
 			<param name="flit_bits" value="256"/>
 			<param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same port share input buffers whose size is proportional to the number of VCs-->
 			<param name="chip_coverage" value="1"/>
 			<!-- When multiple NoCs are present, one NoC will cover part of the whole chip. chip_coverage <=1 -->
 			<stat name="total_accesses" value="360000"/>
 			<!-- This is the number of total accesses within the whole network not for each router -->
 			<stat name="duty_cycle" value="0.1"/>
 		</component>
 <!--**********************************************************************-->
 		<component id="system.mem" name="mem">
 			<!-- Main memory property -->
 			<param name="mem_tech_node" value="32"/>
 			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
 			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
 			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
 			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
 			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
 			<!-- above numbers can be easily found from Wikipedia -->
 			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
 			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
 			Current McPAT assumes single DIMMs are used.--> 		
 			<param name="number_ranks" value="2"/>
 			<param name="num_banks_of_DRAM_chip" value="8"/>			
 			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
 			<param name="output_width_of_DRAM_chip" value="8"/>
 			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
 			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
 			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
 			<param name="burstlength_of_DRAM_chip" value="8"/>
 			<stat name="memory_accesses" value="1052"/>
 			<stat name="memory_reads" value="1052"/>
 			<stat name="memory_writes" value="1052"/>									
 		</component>
 		<component id="system.mc" name="mc">
 			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
 			<!-- The current version of McPAT uses published values for the base parameters of the memory controller;
 			improvements to the MC will be added in later versions. -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
 			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
 			<param name="block_size" value="64"/><!--B-->
 			<param name="number_mcs" value="0"/>
 			<!-- current McPAT only supports homogeneous memory controllers -->
 			<param name="memory_channels_per_mc" value="1"/>
 			<param name="number_ranks" value="2"/>
 			<param name="withPHY" value="0"/>
 			<!-- # of ranks of each channel-->
 			<param name="req_window_size_per_channel" value="32"/>
 			<param name="IO_buffer_size_per_channel" value="32"/>
 			<param name="databus_width" value="128"/>
 			<param name="addressbus_width" value="51"/>
 			<!-- McPAT will add the control bus width to the addressbus width automatically -->
 			<stat name="memory_accesses" value="33333"/>
 			<stat name="memory_reads" value="16667"/>
 			<stat name="memory_writes" value="16667"/>
 			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
 			the average power per MC or per channel. This is sufficient for most applications.
 			Further breakdown can be easily added in later versions. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.niu" name="niu">
 			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
 			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
 				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
 			the average power per NIC or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.pcie" name="pcie">
 			<!-- On chip PCIe controller, including Phy-->
 			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
 				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="withPHY" value="1"/>
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/>
 			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
 			the average power per PCIe controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.flashc" name="flashc">
 		    <param name="number_flashcs" value="0"/>
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
            <param name="withPHY" value="1"/>
 			<param name="peak_transfer_rate" value="200"/><!--Per-controller sustainable peak rate, MB/S -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
 			the average power per flash controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		</component>
 </component>
--- a/ext/mcpat/Niagara1_sharing_SBT.xml
+++ b/ext/mcpat/Niagara1_sharing_SBT.xml
@ -0,0 +1,455 @@
 <?xml version="1.0" ?>
 <component id="root" name="root">
 	<component id="system" name="system">
 		<!--McPAT will skip the components if number is set to 0 -->
 		<param name="number_of_cores" value="64"/>
 		<param name="number_of_L1Directories" value="0"/>
 		<param name="number_of_L2Directories" value="0"/>
 		<param name="number_of_L2s" value="64"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
 		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
 		<param name="number_of_NoCs" value="1"/>
 		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
 		<param name="homogeneous_L2s" value="1"/>
 		<param name="homogeneous_L1Directorys" value="1"/>
 		<param name="homogeneous_L2Directorys" value="1"/>
 		<param name="homogeneous_L3s" value="1"/>
 		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
 		<param name="homogeneous_NoCs" value="1"/>
 		<param name="core_tech_node" value="22"/><!-- nm -->
 		<param name="target_core_clockrate" value="3500"/><!--MHz -->
 		<param name="temperature" value="360"/> <!-- Kelvin -->
 		<param name="number_cache_levels" value="2"/>
 		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
 		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
 		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
 		<param name="machine_bits" value="64"/>
 		<param name="virtual_address_width" value="64"/>
 		<param name="physical_address_width" value="52"/>
 		<param name="virtual_memory_page_size" value="4096"/>
 		<stat name="total_cycles" value="100000"/>
 		<stat name="idle_cycles" value="0"/>
 		<stat name="busy_cycles"  value="100000"/>
 			<!-- This page size (B) is completely different from the page size in the main memory section. This page size is the
 			virtual memory page size from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
 		<!-- *********************** cores ******************* -->
 		<component id="system.core0" name="core0">
 			<!-- Core property -->
 			<param name="clock_rate" value="3500"/>
 			<param name="instruction_length" value="32"/>
 			<param name="opcode_width" value="9"/>
 			<!-- address width determines the tag_width in the cache, LSQ, and buffers in the cache controller;
 			the default value is machine_bits, if not set -->
 			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
 			<!-- inorder/OoO -->
 			<param name="number_hardware_threads" value="4"/>
 			<!-- number_instruction_fetch_ports (icache ports) is always 1 in single-threaded processors;
 			it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since
 			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once. -->
 			<param name="fetch_width" value="1"/>
 			<!-- fetch_width determines the size of cachelines of L1 cache block -->
 			<param name="number_instruction_fetch_ports" value="1"/>
 			<param name="decode_width" value="1"/>
 			<!-- decode_width determines the number of ports of the
 			renaming table (both RAM and CAM) scheme -->
 			<param name="issue_width" value="1"/>
 			<!-- issue_width determines the number of ports of the issue window and other logic,
 			as in the complexity-effective processors paper; issue_width==dispatch_width -->
 			<param name="commit_width" value="1"/>
 			<!-- commit_width determines the number of ports of register files -->
 			<param name="fp_issue_width" value="1"/>
 			<param name="prediction_width" value="0"/> 
 			<!-- number of branch instructions that can be predicted simultaneously -->
 			<!-- The current version of McPAT does not distinguish int and floating point pipelines.
 			These parameters are reserved for future use. -->
 			<param name="pipelines_per_core" value="1,1"/>
 			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
 			<param name="pipeline_depth" value="6,6"/>
 			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
 			<!-- issue and exe unit-->
 			<param name="ALU_per_core" value="1"/>
 			<!-- contains an adder, a shifter, and a logical unit -->
 			<param name="MUL_per_core" value="1"/>
 			<!-- For MUL and Div -->
 			<param name="FPU_per_core" value="0.125"/>		
 			<!-- buffer between IF and ID stage -->
 			<param name="instruction_buffer_size" value="16"/>
 			<!-- buffer between ID and sche/exe stage -->
 			<param name="decoded_stream_buffer_size" value="16"/>
 			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
 			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
 			<param name="instruction_window_size" value="16"/>
 			<param name="fp_instruction_window_size" value="16"/>
 			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
 			<param name="ROB_size" value="80"/>
 			<!-- each in-flight instruction has an entry in ROB -->
 			<!-- registers -->
 			<param name="archi_Regs_IRF_size" value="32"/>			
 			<param name="archi_Regs_FRF_size" value="32"/>
 			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
 			renaming logic is for both integer and floating point insts.  -->
 			<param name="phy_Regs_IRF_size" value="80"/>
 			<param name="phy_Regs_FRF_size" value="80"/>
 			<!-- rename logic -->
 			<param name="rename_scheme" value="0"/>
 			<!-- can be RAM based(0) or CAM based(1) rename scheme 
 			RAM-based scheme will have free list, status table;
 			CAM-based scheme have the valid bit in the data field of the CAM 
 			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
 			Detailed RAT Implementation see TR -->
 			<param name="register_windows_size" value="8"/>
 			<!-- how many windows in the windowed register file, sun processors;
 			no register windowing is used when this number is 0 -->
 			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha).
 			They will always try to execute out of order, though. -->
 			<param name="LSU_order" value="inorder"/>
 			<param name="store_buffer_size" value="32"/>
 			<!-- By default, in-order cores do not have load buffers -->
 			<param name="load_buffer_size" value="32"/>	
 			<!-- number of ports refer to sustainable concurrent memory accesses --> 
 			<param name="memory_ports" value="1"/>	
 			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of the load and store buffers
 			as well as the ports of the Dcache which is connected to the LSU -->
 			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
 			<param name="RAS_size" value="32"/>						
 			<!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check -->
 			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
 			<stat name="total_instructions" value="800000"/>
 			<stat name="int_instructions" value="600000"/>
 			<stat name="fp_instructions" value="20000"/>
 			<stat name="branch_instructions" value="0"/>
 			<stat name="branch_mispredictions" value="0"/>
 			<stat name="load_instructions" value="100000"/>
 			<stat name="store_instructions" value="100000"/>
 			<stat name="committed_instructions" value="800000"/>
 			<stat name="committed_int_instructions" value="600000"/>
 			<stat name="committed_fp_instructions" value="20000"/>
 			<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
 			<!-- the following cycle stats are used for heterogeneous cores only;
 				please ignore them if the cores are homogeneous -->
 			<stat name="total_cycles" value="100000"/>
 		    <stat name="idle_cycles" value="0"/>
 		    <stat name="busy_cycles"  value="100000"/>
 			<!-- instruction buffer stats -->
 			<!-- ROB stats; both RS-based and Phy-based OoO cores have a ROB.
 			The performance simulator should capture the difference in accesses;
 			otherwise, McPAT has to guess based on the number of committed instructions. -->
 			<stat name="ROB_reads" value="263886"/>
 			<stat name="ROB_writes" value="263886"/>
 			<!-- RAT accesses -->
 			<stat name="rename_accesses" value="263886"/>
 			<stat name="fp_rename_accesses" value="263886"/>
 			<!-- decode and rename stage use this, should be total ic - nop -->
 			<!-- Inst window stats -->
 			<stat name="inst_window_reads" value="263886"/>
 			<stat name="inst_window_writes" value="263886"/>
 			<stat name="inst_window_wakeup_accesses" value="263886"/>
 			<stat name="fp_inst_window_reads" value="263886"/>
 			<stat name="fp_inst_window_writes" value="263886"/>
 			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
 			<!--  RF accesses -->
 			<stat name="int_regfile_reads" value="1600000"/>
 			<stat name="float_regfile_reads" value="40000"/>
 			<stat name="int_regfile_writes" value="800000"/>
 			<stat name="float_regfile_writes" value="20000"/>
 			<!-- accesses to the working reg -->
 			<stat name="function_calls" value="5"/>
 			<stat name="context_switches" value="260343"/>
 			<!-- Number of window switches (number of function calls and returns) -->
 			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
 			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
 			<stat name="ialu_accesses" value="800000"/>			
 			<stat name="fpu_accesses" value="10000"/>
 			<stat name="mul_accesses" value="100000"/>
 			<stat name="cdb_alu_accesses" value="1000000"/>
 			<stat name="cdb_mul_accesses" value="0"/>
 			<stat name="cdb_fpu_accesses" value="0"/>
 			<!-- multiple cycle accesses should be counted multiple times, 
 			otherwise, McPAT can use internal counter for different floating point instructions 
 			to get final accesses. But that needs detailed info for floating point inst mix -->
 			<!-- Currently the performance simulator should
 			make sure all the numbers are final numbers,
 			including the explicit read/write accesses
 			and the implicit accesses such as replacements, etc.
 			Future versions of McPAT may be able to infer the implicit accesses
 			based on the params and stats of the last-level cache.
 			The same rule applies to all cache access stats too! -->
 			<!-- following is AF for max power computation. 
 				Do not change them, unless you understand them-->
 			<stat name="IFU_duty_cycle" value="0.25"/>			
 			<stat name="LSU_duty_cycle" value="0.25"/>
 			<stat name="MemManU_I_duty_cycle" value="1"/>
 			<stat name="MemManU_D_duty_cycle" value="0.25"/>
 			<stat name="ALU_duty_cycle" value="0.9"/>
 			<stat name="MUL_duty_cycle" value="0.5"/>
 			<stat name="FPU_duty_cycle" value="0.4"/>
 			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
 			<stat name="MUL_cdb_duty_cycle" value="0.5"/>
 			<stat name="FPU_cdb_duty_cycle" value="0.4"/>
 			<component id="system.core0.predictor" name="PBT">
 				<!-- branch predictor; tournament predictor see Alpha implementation -->
 				<param name="local_predictor_size" value="10,3"/>
 				<param name="local_predictor_entries" value="1024"/>
 				<param name="global_predictor_entries" value="4096"/>
 				<param name="global_predictor_bits" value="2"/>
 				<param name="chooser_predictor_entries" value="4096"/>
 				<param name="chooser_predictor_bits" value="2"/>
 				<!-- These parameters can be combined like below in next version
 				<param name="load_predictor" value="10,3,1024"/>
 				<param name="global_predictor" value="4096,2"/>
 				<param name="predictor_chooser" value="4096,2"/>
 				-->
 			</component>
 			<component id="system.core0.itlb" name="itlb">
 				<param name="number_entries" value="64"/>
 				<stat name="total_accesses" value="800000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 				<!-- there are no write requests to the itlb, although writes happen to the itlb after a miss,
 				which is actually a replacement -->
 			</component>
 			<component id="system.core0.icache" name="icache">
 				<!-- there are no write requests to the icache, although writes happen to it after a miss,
 				which is actually a replacement -->
 				<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<!-- cache_policy: 0 no write or write-through with non-write allocate; 1 write-back with write-allocate -->
 				<param name="buffer_sizes" value="16, 16, 16,0"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
 				<stat name="read_accesses" value="200000"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="conflicts" value="0"/>				
 			</component>
 			<component id="system.core0.dtlb" name="dtlb">
 				<param name="number_entries" value="64"/>
 				<stat name="total_accesses" value="200000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.dcache" name="dcache">
 			        <!-- all the buffer related are optional -->
 				<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.BTB" name="BTB">
 			        <!-- all the buffer related are optional -->
 				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
 				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			</component>
 	</component>
 		<component id="system.L1Directory0" name="L1Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->	
 				<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types, 
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="20"/>	
 				<stat name="duty_cycle" value="0.45"/>	
 		</component>
 		<component id="system.L2Directory0" name="L2Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->	
 				<param name="Dir_config" value="8388608,9,0,1,100, 100"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,8"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types, 
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="100"/>
 			    <stat name="duty_cycle" value="0.45"/>		
 		</component>
 		<component id="system.L20" name="L20">
 			<!-- all the buffer related are optional -->
 			    <param name="merged_dir" value="1"/><!--if static bank tag is used as the directory -->
 				<param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
 			    <!-- consider 4-way bank interleaving for Niagara 1 -->
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="0"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="write_misses" value="0"/>
 				<stat name="conflicts" value="0"/>	
 			    <stat name="duty_cycle" value="0.5"/>	
 				<stat name="coherent_read_accesses" value="400000"/>
 				<stat name="coherent_write_accesses" value="0"/>
 				<stat name="coherent_read_misses" value="400000"/>
 				<stat name="coherent_write_misses" value="0"/>
 			    <stat name="dir_duty_cycle" value="0.5"/>
 		</component>
 <!--**********************************************************************-->
 <component id="system.L30" name="L30">
 				<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 	            <stat name="duty_cycle" value="0.35"/>
 				<param name="Merged_dir" value="1"/><!--if static bank tag is used as the directory -->
 				<stat name="coherent_read_accesses" value="400000"/>
 				<stat name="coherent_write_accesses" value="0"/>
 				<stat name="coherent_read_misses" value="400000"/>
 				<stat name="coherent_write_misses" value="0"/>
 			    <stat name="dir_duty_cycle" value="0.5"/>	
 		</component>
 <!--**********************************************************************-->
 		<component id="system.NoC0" name="noc0">
 			<param name="clockrate" value="3500"/>
 			<param name="type" value="1"/>
 			<!-- 1 NoC, 0 bus -->
 			<param name="horizontal_nodes" value="8"/>
 			<param name="vertical_nodes" value="8"/>
 			<param name="has_global_link" value="1"/>
 			<!-- 1 has global link, 0 does not have global link -->
 			<param name="link_throughput" value="1"/><!--w.r.t clock -->
 			<param name="link_latency" value="1"/><!--w.r.t clock -->
 			<!-- throughput >= latency -->
 			<!-- Router architecture -->
 			<param name="input_ports" value="5"/>
 			<param name="output_ports" value="5"/>
 			<param name="virtual_channel_per_port" value="1"/>
 			<!-- input buffer; in classic routers only input ports need buffers -->
 			<param name="flit_bits" value="256"/>
 			<param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
 			<param name="chip_coverage" value="1"/>
 			<!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 -->
 			<stat name="total_accesses" value="360000"/>
 			<!-- This is the number of total accesses within the whole network not for each router -->
 			<stat name="duty_cycle" value="0.1"/>
 		</component>
 <!--**********************************************************************-->
 		<component id="system.mem" name="mem">
 			<!-- Main memory property -->
 			<param name="mem_tech_node" value="32"/>
 			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
 			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
 			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
 			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
 			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
 			<!-- above numbers can be easily found from Wikipedia -->
 			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
 			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
 			Current McPAT assumes single DIMMs are used.--> 		
 			<param name="number_ranks" value="2"/>
 			<param name="num_banks_of_DRAM_chip" value="8"/>			
 			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
 			<param name="output_width_of_DRAM_chip" value="8"/>
 			<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
 			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
 			<param name="burstlength_of_DRAM_chip" value="8"/>
 			<stat name="memory_accesses" value="1052"/>
 			<stat name="memory_reads" value="1052"/>
 			<stat name="memory_writes" value="1052"/>									
 		</component>
 		<component id="system.mc" name="mc">
 			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
 			<!-- current version of McPAT uses published values for base parameters of memory controller;
 			improvements on MC will be added in later versions. -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
 			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
 			<param name="block_size" value="64"/><!--B-->
 			<param name="number_mcs" value="0"/>
 			<!-- current McPAT only supports homogeneous memory controllers -->
 			<param name="memory_channels_per_mc" value="1"/>
 			<param name="number_ranks" value="2"/>
 			<param name="withPHY" value="0"/>
 			<!-- # of ranks of each channel-->
 			<param name="req_window_size_per_channel" value="32"/>
 			<param name="IO_buffer_size_per_channel" value="32"/>
 			<param name="databus_width" value="128"/>
 			<param name="addressbus_width" value="51"/>
 			<!-- McPAT will add the control bus width to the addressbus width automatically -->
 			<stat name="memory_accesses" value="33333"/>
 			<stat name="memory_reads" value="16667"/>
 			<stat name="memory_writes" value="16667"/>
 			<!-- McPAT does not track individual mc, instead, it takes the total accesses and calculates 
 			the average power per MC or per channel. This is sufficient for most applications. 
 			Further trackdown can be easily added in later versions. -->  			
 		</component>
 <!--**********************************************************************-->
 		<component id="system.niu" name="niu">
 			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
 			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
 				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual nic, instead, it takes the total accesses and calculates 
 			the average power per nic or per channel. This is sufficient for most applications. -->  			
 		</component>
 <!--**********************************************************************-->
 		<component id="system.pcie" name="pcie">
 			<!-- On chip PCIe controller, including Phy-->
 			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
 				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="withPHY" value="1"/>
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/>
 			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculates 
 			the average power per pcie controller or per channel. This is sufficient for most applications. -->  			
 		</component>
 <!--**********************************************************************-->
 		<component id="system.flashc" name="flashc">
 		    <param name="number_flashcs" value="0"/>
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
            <param name="withPHY" value="1"/>
 			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculates 
 			the average power per fc or per channel. This is sufficient for most applications -->  			
 		</component>
 <!--**********************************************************************-->
 		</component>
 </component>
--- a/ext/mcpat/Niagara1_sharing_ST.xml
+++ b/ext/mcpat/Niagara1_sharing_ST.xml
@ -0,0 +1,443 @@
 <?xml version="1.0" ?>
 <component id="root" name="root">
 	<component id="system" name="system">
 		<!--McPAT will skip the components if number is set to 0 -->
 		<param name="number_of_cores" value="64"/>
 		<param name="number_of_L1Directories" value="0"/>
 		<param name="number_of_L2Directories" value="1"/>
 		<param name="number_of_L2s" value="64"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
 		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
 		<param name="number_of_NoCs" value="1"/>
 		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
 		<param name="homogeneous_L2s" value="1"/>
 		<param name="homogeneous_L1Directorys" value="1"/>
 		<param name="homogeneous_L2Directorys" value="1"/>
 		<param name="homogeneous_L3s" value="1"/>
 		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
 		<param name="homogeneous_NoCs" value="1"/>
 		<param name="core_tech_node" value="22"/><!-- nm -->
 		<param name="target_core_clockrate" value="3500"/><!--MHz -->
 		<param name="temperature" value="360"/> <!-- Kelvin -->
 		<param name="number_cache_levels" value="2"/>
 		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
 		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
 		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
 		<param name="machine_bits" value="64"/>
 		<param name="virtual_address_width" value="64"/>
 		<param name="physical_address_width" value="52"/>
 		<param name="virtual_memory_page_size" value="4096"/>
 		<stat name="total_cycles" value="100000"/>
 		<stat name="idle_cycles" value="0"/>
 		<stat name="busy_cycles"  value="100000"/>
 			<!-- virtual_memory_page_size (B) is completely different from page_size_of_DRAM_chip in the main memory section: it is the size of a
 			virtual memory page from the OS/architecture perspective, whereas the page size in the main memory section is the actual physical line in a DRAM bank  -->
 		<!-- *********************** cores ******************* -->
 		<component id="system.core0" name="core0">
 			<!-- Core property -->
 			<param name="clock_rate" value="3500"/>
 			<param name="instruction_length" value="32"/>
 			<param name="opcode_width" value="9"/>
 			<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
 			default value is machine_bits, if not set --> 
 			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
 			<!-- inorder/OoO -->
 			<param name="number_hardware_threads" value="4"/>
 			<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processors;
 			it may be more than one only in SMT processors. BTB ports always equal fetch ports since 
 			branch information of consecutive branch instructions in the same fetch group can be read out from the BTB at once.--> 
 			<param name="fetch_width" value="1"/>
 			<!-- fetch_width determines the size of cachelines of L1 cache block -->
 			<param name="number_instruction_fetch_ports" value="1"/>
 			<param name="decode_width" value="1"/>
 			<!-- decode_width determines the number of ports of the 
 			renaming table (both RAM and CAM) scheme -->
 			<param name="issue_width" value="1"/>
 			<!-- issue_width determines the number of ports of Issue window and other logic 
 			as in the complexity effective processors paper; issue_width==dispatch_width -->
 			<param name="commit_width" value="1"/>
 			<!-- commit_width determines the number of ports of register files -->
 			<param name="fp_issue_width" value="1"/>
 			<param name="prediction_width" value="0"/> 
 			<!-- number of branch instructions that can be predicted simultaneously -->
 			<!-- Current version of McPAT does not distinguish int and floating point pipelines.
 			These parameters are reserved for future use.--> 
 			<param name="pipelines_per_core" value="1,1"/>
 			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
 			<param name="pipeline_depth" value="6,6"/>
 			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
 			<!-- issue and exe unit-->
 			<param name="ALU_per_core" value="1"/>
 			<!-- contains an adder, a shifter, and a logical unit -->
 			<param name="MUL_per_core" value="1"/>
 			<!-- For MUL and Div -->
 			<param name="FPU_per_core" value="0.125"/>		
 			<!-- buffer between IF and ID stage -->
 			<param name="instruction_buffer_size" value="16"/>
 			<!-- buffer between ID and sche/exe stage -->
 			<param name="decoded_stream_buffer_size" value="16"/>
 			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
 			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
 			<param name="instruction_window_size" value="16"/>
 			<param name="fp_instruction_window_size" value="16"/>
 			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
 			<param name="ROB_size" value="80"/>
 			<!-- each in-flight instruction has an entry in ROB -->
 			<!-- registers -->
 			<param name="archi_Regs_IRF_size" value="32"/>			
 			<param name="archi_Regs_FRF_size" value="32"/>
 			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
 			renaming logic is for both integer and floating point insts.  -->
 			<param name="phy_Regs_IRF_size" value="80"/>
 			<param name="phy_Regs_FRF_size" value="80"/>
 			<!-- rename logic -->
 			<param name="rename_scheme" value="0"/>
 			<!-- can be RAM based(0) or CAM based(1) rename scheme 
 			RAM-based scheme will have free list, status table;
 			CAM-based scheme have the valid bit in the data field of the CAM 
 			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
 			Detailed RAT Implementation see TR -->
 			<param name="register_windows_size" value="8"/>
 			<!-- how many windows in the windowed register file, sun processors;
 			no register windowing is used when this number is 0 -->
 			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
 			they will always try to execute out-of-order though. -->
 			<param name="LSU_order" value="inorder"/>
 			<param name="store_buffer_size" value="32"/>
 			<!-- By default, in-order cores do not have load buffers -->
 			<param name="load_buffer_size" value="32"/>	
 			<!-- number of ports refer to sustainable concurrent memory accesses --> 
 			<param name="memory_ports" value="1"/>	
 			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
 			as well as the ports of Dcache which is connected to LSU -->	
 			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
 			<param name="RAS_size" value="32"/>						
 			<!-- general stats, defines simulation periods; require total, idle, and busy cycles for sanity check  -->
 			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
 			<stat name="total_instructions" value="800000"/>
 			<stat name="int_instructions" value="600000"/>
 			<stat name="fp_instructions" value="20000"/>
 			<stat name="branch_instructions" value="0"/>
 			<stat name="branch_mispredictions" value="0"/>
 			<stat name="load_instructions" value="100000"/>
 			<stat name="store_instructions" value="100000"/>
 			<stat name="committed_instructions" value="800000"/>
 			<stat name="committed_int_instructions" value="600000"/>
 			<stat name="committed_fp_instructions" value="20000"/>
 			<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
 			<!-- the following cycle stats are used for heterogeneous cores only, 
 				please ignore them if the cores are homogeneous -->
 			<stat name="total_cycles" value="100000"/>
 		    <stat name="idle_cycles" value="0"/>
 		    <stat name="busy_cycles"  value="100000"/>
 			<!-- instruction buffer stats -->
 			<!-- ROB stats, both RS and Phy based OoOs have ROB;
 			performance simulator should capture the difference on accesses,
 			otherwise, McPAT has to guess based on number of committed instructions. -->
 			<stat name="ROB_reads" value="263886"/>
 			<stat name="ROB_writes" value="263886"/>
 			<!-- RAT accesses -->
 			<stat name="rename_accesses" value="263886"/>
 			<stat name="fp_rename_accesses" value="263886"/>
 			<!-- decode and rename stage use this, should be total ic - nop -->
 			<!-- Inst window stats -->
 			<stat name="inst_window_reads" value="263886"/>
 			<stat name="inst_window_writes" value="263886"/>
 			<stat name="inst_window_wakeup_accesses" value="263886"/>
 			<stat name="fp_inst_window_reads" value="263886"/>
 			<stat name="fp_inst_window_writes" value="263886"/>
 			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
 			<!--  RF accesses -->
 			<stat name="int_regfile_reads" value="1600000"/>
 			<stat name="float_regfile_reads" value="40000"/>
 			<stat name="int_regfile_writes" value="800000"/>
 			<stat name="float_regfile_writes" value="20000"/>
 			<!-- accesses to the working reg -->
 			<stat name="function_calls" value="5"/>
 			<stat name="context_switches" value="260343"/>
 			<!-- Number of window switches (number of function calls and returns)-->
 			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
 			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
 			<stat name="ialu_accesses" value="800000"/>			
 			<stat name="fpu_accesses" value="10000"/>
 			<stat name="mul_accesses" value="100000"/>
 			<stat name="cdb_alu_accesses" value="1000000"/>
 			<stat name="cdb_mul_accesses" value="0"/>
 			<stat name="cdb_fpu_accesses" value="0"/>
 			<!-- multiple cycle accesses should be counted multiple times, 
 			otherwise, McPAT can use internal counter for different floating point instructions 
 			to get final accesses. But that needs detailed info for floating point inst mix -->
 			<!--  currently the performance simulator should 
 			make sure all the numbers are final numbers, 
 			including the explicit read/write accesses, 
 			and the implicit accesses such as replacements, etc.
 			Future versions of McPAT may be able to infer the implicit accesses
 			based on param and stats of last level cache.
 			The same rule applies to all cache access stats too!  -->
 			<!-- following is AF for max power computation. 
 				Do not change them, unless you understand them-->
 			<stat name="IFU_duty_cycle" value="0.25"/>			
 			<stat name="LSU_duty_cycle" value="0.25"/>
 			<stat name="MemManU_I_duty_cycle" value="1"/>
 			<stat name="MemManU_D_duty_cycle" value="0.25"/>
 			<stat name="ALU_duty_cycle" value="0.9"/>
 			<stat name="MUL_duty_cycle" value="0.5"/>
 			<stat name="FPU_duty_cycle" value="0.4"/>
 			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
 			<stat name="MUL_cdb_duty_cycle" value="0.5"/>
 			<stat name="FPU_cdb_duty_cycle" value="0.4"/>
 			<component id="system.core0.predictor" name="PBT">
 				<!-- branch predictor; tournament predictor see Alpha implementation -->
 				<param name="local_predictor_size" value="10,3"/>
 				<param name="local_predictor_entries" value="1024"/>
 				<param name="global_predictor_entries" value="4096"/>
 				<param name="global_predictor_bits" value="2"/>
 				<param name="chooser_predictor_entries" value="4096"/>
 				<param name="chooser_predictor_bits" value="2"/>
 				<!-- These parameters can be combined like below in next version
 				<param name="load_predictor" value="10,3,1024"/>
 				<param name="global_predictor" value="4096,2"/>
 				<param name="predictor_chooser" value="4096,2"/>
 				-->
 			</component>
 			<component id="system.core0.itlb" name="itlb">
 				<param name="number_entries" value="64"/>
 				<stat name="total_accesses" value="800000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
 				which is actually a replacement -->
 			</component>
 			<component id="system.core0.icache" name="icache">
 				<!-- there are no write requests to icache, although writes happen to it after a miss, 
 				which is actually a replacement -->
 				<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
 				<param name="buffer_sizes" value="16, 16, 16,0"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
 				<stat name="read_accesses" value="200000"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="conflicts" value="0"/>				
 			</component>
 			<component id="system.core0.dtlb" name="dtlb">
 				<param name="number_entries" value="64"/>
 				<stat name="total_accesses" value="200000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.dcache" name="dcache">
 			        <!-- all the buffer related are optional -->
 				<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.BTB" name="BTB">
 			        <!-- all the buffer related are optional -->
 				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
 				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			</component>
 	</component>
 		<component id="system.L1Directory0" name="L1Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->	
 				<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types, 
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="20"/>	
 				<stat name="duty_cycle" value="0.45"/>	
 		</component>
 		<component id="system.L2Directory0" name="L2Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->	
 				<param name="Dir_config" value="8388608,9,0,1,100, 100"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3500"/>
 				<param name="ports" value="0,0,8"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types, 
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="100"/>
 			    <stat name="duty_cycle" value="0.45"/>		
 		</component>
 		<component id="system.L20" name="L20">
 			<!-- all the buffer related are optional -->
 				<param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
 			    <param name="Merged_dir" value="1"/>
 			    <!-- consider 4-way bank interleaving for Niagara 1 -->
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="0"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="write_misses" value="0"/>
 				<stat name="conflicts" value="0"/>	
 			    <stat name="duty_cycle" value="0.5"/>	
 		</component>
 <!--**********************************************************************-->
 <component id="system.L30" name="L30">
 				<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="Merged_dir" value="1"/>
 				<param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 	            <stat name="duty_cycle" value="0.35"/>	
 		</component>
 <!--**********************************************************************-->
 		<component id="system.NoC0" name="noc0">
 			<param name="clockrate" value="3500"/>
 			<param name="type" value="1"/>
 			<!-- 1 NoC, 0 bus -->
 			<param name="horizontal_nodes" value="8"/>
 			<param name="vertical_nodes" value="8"/>
 			<param name="has_global_link" value="1"/>
 			<!-- 1 has global link, 0 does not have global link -->
 			<param name="link_throughput" value="1"/><!--w.r.t clock -->
 			<param name="link_latency" value="1"/><!--w.r.t clock -->
 			<!-- throughput >= latency -->
 			<!-- Router architecture -->
 			<param name="input_ports" value="5"/>
 			<param name="output_ports" value="5"/>
 			<param name="virtual_channel_per_port" value="1"/>
 			<!-- input buffer; in classic routers only input ports need buffers -->
 			<param name="flit_bits" value="256"/>
 			<param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
 			<param name="chip_coverage" value="1"/>
 			<!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 -->
 			<stat name="total_accesses" value="360000"/>
 			<!-- This is the number of total accesses within the whole network not for each router -->
 			<stat name="duty_cycle" value="0.1"/>
 		</component>
 <!--**********************************************************************-->
 		<component id="system.mem" name="mem">
 			<!-- Main memory property -->
 			<param name="mem_tech_node" value="32"/>
 			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
 			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
 			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
 			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
 			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
 			<!-- above numbers can be easily found from Wikipedia -->
 			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
 			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
 			Current McPAT assumes single DIMMs are used.--> 		
 			<param name="number_ranks" value="2"/>
 			<param name="num_banks_of_DRAM_chip" value="8"/>			
 			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
 			<param name="output_width_of_DRAM_chip" value="8"/>
 			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
 			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
 			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
 			<param name="burstlength_of_DRAM_chip" value="8"/>
 			<stat name="memory_accesses" value="1052"/>
 			<stat name="memory_reads" value="1052"/>
 			<stat name="memory_writes" value="1052"/>									
 		</component>
 		<component id="system.mc" name="mc">
 			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
 			<!-- current version of McPAT uses published values for base parameters of memory controller;
 			improvements on MC will be added in later versions. -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
 			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
 			<param name="block_size" value="64"/><!--B-->
 			<param name="number_mcs" value="0"/>
 			<!-- current McPAT only supports homogeneous memory controllers -->
 			<param name="memory_channels_per_mc" value="1"/>
 			<param name="number_ranks" value="2"/>
 			<param name="withPHY" value="0"/>
 			<!-- # of ranks of each channel-->
 			<param name="req_window_size_per_channel" value="32"/>
 			<param name="IO_buffer_size_per_channel" value="32"/>
 			<param name="databus_width" value="128"/>
 			<param name="addressbus_width" value="51"/>
 			<!-- McPAT will add the control bus width to the addressbus width automatically -->
 			<stat name="memory_accesses" value="33333"/>
 			<stat name="memory_reads" value="16667"/>
 			<stat name="memory_writes" value="16667"/>
 			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
 			the average power per MC or per channel. This is sufficient for most applications.
 			Further breakdown can be easily added in later versions. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.niu" name="niu">
 			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
 			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
 				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only has one port -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
 			the average power per NIC or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.pcie" name="pcie">
 			<!-- On chip PCIe controller, including Phy-->
 			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
 				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="withPHY" value="1"/>
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/>
 			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
 			the average power per PCIe controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.flashc" name="flashc">
 		    <param name="number_flashcs" value="0"/>
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
            <param name="withPHY" value="1"/>
 			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
 			the average power per flash controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		</component>
 </component>
--- a/ext/mcpat/Niagara2.xml
+++ b/ext/mcpat/Niagara2.xml
@ -0,0 +1,438 @@
 <?xml version="1.0" ?>
 <component id="root" name="root">
 	<component id="system" name="system">
 		<!--McPAT will skip the components if number is set to 0 -->
 		<param name="number_of_cores" value="8"/>
 		<param name="number_of_L1Directories" value="8"/>
 		<param name="number_of_L2Directories" value="0"/>
 		<param name="number_of_L2s" value="8"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
 		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
 		<param name="number_of_NoCs" value="1"/>
 		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
 		<param name="homogeneous_L2s" value="1"/>
 		<param name="homogeneous_L1Directorys" value="1"/>
 		<param name="homogeneous_L2Directorys" value="1"/>
 		<param name="homogeneous_L3s" value="1"/>
 		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
 		<param name="homogeneous_NoCs" value="1"/>
 		<param name="core_tech_node" value="65"/><!-- nm -->
 		<param name="target_core_clockrate" value="1400"/><!--MHz -->
 		<param name="temperature" value="380"/> <!-- Kelvin -->
 		<param name="number_cache_levels" value="2"/>
 		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
 		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
 		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
 		<param name="machine_bits" value="64"/>
 		<param name="virtual_address_width" value="64"/>
 		<param name="physical_address_width" value="52"/>
 		<param name="virtual_memory_page_size" value="4096"/>
 		<stat name="total_cycles" value="100000"/>
 		<stat name="idle_cycles" value="0"/>
 		<stat name="busy_cycles"  value="100000"/>
 			<!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of
 			virtual memory from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
 		<!-- *********************** cores ******************* -->
 		<component id="system.core0" name="core0">
 			<!-- Core property -->
 			<param name="clock_rate" value="1400"/>
 			<param name="instruction_length" value="32"/>
 			<param name="opcode_width" value="9"/>
 			<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller
 			default value is machine_bits, if not set -->
 			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
 			<!-- inorder/OoO -->
 			<param name="number_hardware_threads" value="4"/>
 			<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
 			it only may be more than one in SMT processors. BTB ports always equals to fetch ports since
 			branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.-->
 			<param name="fetch_width" value="1"/>
 			<!-- fetch_width determines the size of cachelines of L1 cache block -->
 			<param name="number_instruction_fetch_ports" value="1"/>
 			<param name="decode_width" value="1"/>
 			<!-- decode_width determines the number of ports of the
 			renaming table (both RAM and CAM) scheme -->
 			<param name="issue_width" value="1"/>
 			<!-- issue_width determines the number of ports of Issue window and other logic
 			as in the complexity effective processors paper; issue_width==dispatch_width -->
 			<param name="commit_width" value="1"/>
 			<!-- commit_width determines the number of ports of register files -->
 			<param name="fp_issue_width" value="1"/>
 			<param name="prediction_width" value="0"/> 
 			<!-- number of branch instructions that can be predicted simultaneously -->
 			<!-- Current version of McPAT does not distinguish int and floating point pipelines
 			These parameters are reserved for future use.-->
 			<param name="pipelines_per_core" value="2,1"/>
 			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
 			<param name="pipeline_depth" value="8,8"/>
 			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
 			<!-- issue and exe unit-->
 			<param name="ALU_per_core" value="2"/>
 			<!-- contains an adder, a shifter, and a logical unit -->
 			<param name="MUL_per_core" value="0"/>
 			<!-- For MUL and Div -->
 			<param name="FPU_per_core" value="1"/>		
 			<!-- buffer between IF and ID stage -->
 			<param name="instruction_buffer_size" value="32"/>
 			<!-- buffer between ID and sche/exe stage -->
 			<param name="decoded_stream_buffer_size" value="16"/>
 			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
 			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
 			<param name="instruction_window_size" value="16"/>
 			<param name="fp_instruction_window_size" value="16"/>
 			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
 			<param name="ROB_size" value="80"/>
 			<!-- each in-flight instruction has an entry in ROB -->
 			<!-- registers -->
 			<param name="archi_Regs_IRF_size" value="32"/>			
 			<param name="archi_Regs_FRF_size" value="32"/>
 			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
 			renaming logic is for both integer and floating point insts.  -->
 			<param name="phy_Regs_IRF_size" value="80"/>
 			<param name="phy_Regs_FRF_size" value="80"/>
 			<!-- rename logic -->
 			<param name="rename_scheme" value="0"/>
 			<!-- can be RAM based(0) or CAM based(1) rename scheme 
 			RAM-based scheme will have free list, status table;
 			CAM-based scheme have the valid bit in the data field of the CAM 
 			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
 			Detailed RAT Implementation see TR -->
 			<param name="register_windows_size" value="8"/>
 			<!-- how many windows in the windowed register file, sun processors;
 			no register windowing is used when this number is 0 -->
 			<!-- In OoO cores, loads and stores can be issued either in-order (Pentium Pro) or out-of-order (Alpha);
 			they will always try to execute out-of-order though. -->
 			<param name="LSU_order" value="inorder"/>
 			<param name="store_buffer_size" value="64"/>
 			<!-- By default, in-order cores do not have load buffers -->
 			<param name="load_buffer_size" value="64"/>	
 			<!-- number of ports refer to sustainable concurrent memory accesses --> 
 			<param name="memory_ports" value="1"/>	
 			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
 			as well as the ports of Dcache which is connected to LSU -->
 			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
 			<param name="RAS_size" value="32"/>						
 			<!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check  -->
 			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
 			<stat name="total_instructions" value="1600000"/>
 			<stat name="int_instructions" value="1200000"/>
 			<stat name="fp_instructions" value="40000"/>
 			<stat name="branch_instructions" value="0"/>
 			<stat name="branch_mispredictions" value="0"/>
 			<stat name="load_instructions" value="200000"/>
 			<stat name="store_instructions" value="200000"/>
 			<stat name="committed_instructions" value="1600000"/>
 			<stat name="committed_int_instructions" value="1200000"/>
 			<stat name="committed_fp_instructions" value="40000"/>
 			<stat name="pipeline_duty_cycle" value="0.5"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
 			<!-- the following cycle stats are used for heterogeneous cores only,
 				please ignore them if the cores are homogeneous -->
 			<stat name="total_cycles" value="100000"/>
 		    <stat name="idle_cycles" value="0"/>
 		    <stat name="busy_cycles"  value="100000"/>
 			<!-- instruction buffer stats -->
 			<!-- ROB stats, both RS and Phy based OoOs have ROB
 			performance simulator should capture the difference on accesses,
 			otherwise, McPAT has to guess based on number of committed instructions. -->
 			<stat name="ROB_reads" value="263886"/>
 			<stat name="ROB_writes" value="263886"/>
 			<!-- RAT accesses -->
 			<stat name="rename_accesses" value="263886"/>
 			<stat name="fp_rename_accesses" value="263886"/>
 			<!-- decode and rename stage use this, should be total ic - nop -->
 			<!-- Inst window stats -->
 			<stat name="inst_window_reads" value="263886"/>
 			<stat name="inst_window_writes" value="263886"/>
 			<stat name="inst_window_wakeup_accesses" value="263886"/>
 			<stat name="fp_inst_window_reads" value="263886"/>
 			<stat name="fp_inst_window_writes" value="263886"/>
 			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
 			<!--  RF accesses -->
 			<stat name="int_regfile_reads" value="3200000"/>
 			<stat name="float_regfile_reads" value="80000"/>
 			<stat name="int_regfile_writes" value="1600000"/>
 			<stat name="float_regfile_writes" value="40000"/>
 			<!-- accesses to the working reg -->
 			<stat name="function_calls" value="5"/>
 			<stat name="context_switches" value="260343"/>
 			<!-- Number of window switches (number of function calls and returns)-->
 			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
 			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
 			<stat name="ialu_accesses" value="1600000"/>			
 			<stat name="fpu_accesses" value="10000"/>
 			<stat name="mul_accesses" value="100000"/>
 			<stat name="cdb_alu_accesses" value="1200000"/>
 			<stat name="cdb_mul_accesses" value="0"/>
 			<stat name="cdb_fpu_accesses" value="0"/>
 			<!-- multiple cycle accesses should be counted multiple times, 
 			otherwise, McPAT can use internal counter for different floating point instructions 
 			to get final accesses. But that needs detailed info for floating point inst mix -->
 			<!--  currently the performance simulator should
 			make sure all the numbers are final numbers,
 			including the explicit read/write accesses,
 			and the implicit accesses such as replacements, etc.
 			Future versions of McPAT may be able to infer the implicit accesses
 			based on the params and stats of the last level cache.
 			The same rule applies to all cache access stats too!  -->
 			<!-- following is AF for max power computation. 
 				Do not change them, unless you understand them-->
 			<stat name="IFU_duty_cycle" value="0.5"/>			
 			<stat name="LSU_duty_cycle" value="0.25"/>
 			<stat name="MemManU_I_duty_cycle" value="0.5"/>
 			<stat name="MemManU_D_duty_cycle" value="0.25"/>
 			<stat name="ALU_duty_cycle" value="0.9"/>
 			<stat name="MUL_duty_cycle" value="0"/>
 			<stat name="FPU_duty_cycle" value="0.6"/>
 			<!--FPU also handles Mul/div -->
 			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
 			<stat name="MUL_cdb_duty_cycle" value="0"/>
 			<stat name="FPU_cdb_duty_cycle" value="0.6"/>	
 			<component id="system.core0.predictor" name="PBT">
 				<!-- branch predictor; tournament predictor see Alpha implementation -->
 				<param name="local_predictor_size" value="10,3"/>
 				<param name="local_predictor_entries" value="1024"/>
 				<param name="global_predictor_entries" value="4096"/>
 				<param name="global_predictor_bits" value="2"/>
 				<param name="chooser_predictor_entries" value="4096"/>
 				<param name="chooser_predictor_bits" value="2"/>
 				<!-- These parameters can be combined like below in next version
 				<param name="load_predictor" value="10,3,1024"/>
 				<param name="global_predictor" value="4096,2"/>
 				<param name="predictor_chooser" value="4096,2"/>
 				-->
 			</component>
 			<component id="system.core0.itlb" name="itlb">
 				<param name="number_entries" value="64"/>
 				<stat name="total_accesses" value="800000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
 				which is actually a replacement -->
 			</component>
 			<component id="system.core0.icache" name="icache">
 				<!-- there are no write requests to icache although writes happen to it after a miss,
 				which is actually a replacement -->
 				<param name="icache_config" value="16384,32,8,1,1,7,8,0"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<!-- cache_policy: 0 no write or write-through with non-write allocate; 1 write-back with write-allocate -->
 				<param name="buffer_sizes" value="16, 16, 16,0"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
 				<stat name="read_accesses" value="200000"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="conflicts" value="0"/>				
 			</component>
 			<component id="system.core0.dtlb" name="dtlb">
 				<param name="number_entries" value="128"/>
 				<stat name="total_accesses" value="200000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.dcache" name="dcache">
 			        <!-- all the buffer related are optional -->
 				<param name="dcache_config" value="8192,16,4,1, 1,3, 16,0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.BTB" name="BTB">
 			        <!-- all the buffer related are optional -->
 				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
 				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			</component>
 	</component>
 		<component id="system.L1Directory0" name="L1Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="1024,2,0,1,1,1, 8"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="1400"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="20"/>	
 		</component>
 		<component id="system.L2Directory0" name="L2Directory0">
 				<param name="Directory_type" value="1"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="1400"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="100"/>			    
 		</component>
 		<component id="system.L20" name="L20">
 			<!-- all the buffer related are optional -->
 				<param name="L2_config" value="524228,64,16,1, 8,23, 64,1"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<param name="clockrate" value="1400"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<stat name="read_accesses" value="400000"/>
 				<stat name="write_accesses" value="0"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="write_misses" value="0"/>
 				<stat name="conflicts" value="0"/>	
 			    <stat name="duty_cycle" value="1"/>	
 		</component>
 <!--**********************************************************************-->
 <component id="system.L30" name="L30">
 				<param name="L3_config" value="1048576,64,16,1, 2,100, 64, 1"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 				<param name="clockrate" value="3500"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			    <stat name="duty_cycle" value="0.35"/>				
 		</component>
 <!--**********************************************************************-->
 		<component id="system.NoC0" name="noc0">
 			<param name="clockrate" value="1400"/>
 			<param name="horizontal_nodes" value="2"/>
 			<param name="vertical_nodes" value="1"/>
 			<param name="has_global_link" value="0"/>
 			<!-- 1 has global link, 0 does not have global link -->
 			<param name="link_throughput" value="1"/><!--w.r.t clock -->
 			<param name="link_latency" value="1"/><!--w.r.t clock -->
 			<!-- throughput >= latency -->
 			<!-- Router architecture -->
 			<param name="input_ports" value="9"/>
 			<param name="output_ports" value="8"/>
 			<param name="virtual_channel_per_port" value="1"/>
 			<!-- input buffer; in classic routers only input ports need buffers -->
 			<param name="flit_bits" value="136"/>
 			<param name="input_buffer_entries_per_vc" value="16"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
 			<param name="chip_coverage" value="1"/>
 			<!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 -->
 			<stat name="total_accesses" value="160000"/>
 			<!-- This is the number of total accesses within the whole network not for each router -->
 		    <stat name="duty_cycle" value="0.1"/>
 		</component>
 <!--**********************************************************************-->
 		<component id="system.mem" name="mem">
 			<!-- Main memory property -->
 			<param name="mem_tech_node" value="32"/>
 			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
 			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
 			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
 			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
 			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
 			<!-- above numbers can be easily found from Wikipedia -->
 			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
 			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
 			Current McPAT assumes single DIMMs are used.--> 		
 			<param name="number_ranks" value="2"/>
 			<param name="num_banks_of_DRAM_chip" value="8"/>			
 			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
 			<param name="output_width_of_DRAM_chip" value="8"/>
 			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
 			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
 			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
 			<param name="burstlength_of_DRAM_chip" value="8"/>
 			<stat name="memory_accesses" value="1052"/>
 			<stat name="memory_reads" value="1052"/>
 			<stat name="memory_writes" value="1052"/>									
 		</component>
 		<component id="system.mc" name="mc">
 			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
 			<!-- current version of McPAT uses published values for base parameters of memory controller;
 			improvements on MC will be added in later versions. -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="mc_clock" value="400"/><!--MHz-->
 			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
 			<param name="block_size" value="64"/><!--(B) the block size of last level cache, which is the unit for one memory burst transfer -->
 			<param name="number_mcs" value="4"/>
 			<!-- current McPAT only supports homogeneous memory controllers -->
 			<param name="memory_channels_per_mc" value="1"/>
 			<param name="number_ranks" value="2"/>
 			<param name="withPHY" value="0"/>
 			<!-- # of ranks of each channel-->
 			<param name="req_window_size_per_channel" value="32"/>
 			<param name="IO_buffer_size_per_channel" value="32"/>
 			<param name="databus_width" value="128"/>
 			<param name="addressbus_width" value="51"/>
 			<!-- McPAT will add the control bus width to the addressbus width automatically -->
 			<stat name="memory_accesses" value="66666"/>
 			<stat name="memory_reads" value="33333"/>
 			<stat name="memory_writes" value="33333"/>
 			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
 			the average power per MC or per channel. This is sufficient for most applications.
 			Further breakdown can be easily added in later versions. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.niu" name="niu">
 			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
 			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
 				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="2"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only has one port -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
 			the average power per NIC or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.pcie" name="pcie">
 			<!-- On chip PCIe controller, including Phy-->
 			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
 				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="withPHY" value="1"/>
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="1"/>
 			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
 			the average power per PCIe controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.flashc" name="flashc">
 		    <param name="number_flashcs" value="0"/>
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
            <param name="withPHY" value="1"/>
 			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
 			the average power per flash controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		</component>
 </component>
--- a/ext/mcpat/Penryn.xml
+++ b/ext/mcpat/Penryn.xml
@ -0,0 +1,456 @@
 <?xml version="1.0" ?>
 <component id="root" name="root">
 	<component id="system" name="system">
 		<!--McPAT will skip the components if number is set to 0 -->
 		<param name="number_of_cores" value="2"/>
 		<param name="number_of_L1Directories" value="0"/>
 		<param name="number_of_L2Directories" value="0"/>
 		<param name="number_of_L2s" value="1"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
 		<param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
 		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
 		<param name="number_of_NoCs" value="1"/>
 		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
 		<param name="homogeneous_L2s" value="1"/>
 		<param name="homogeneous_L1Directorys" value="1"/>
 		<param name="homogeneous_L2Directorys" value="1"/>
 		<param name="homogeneous_L3s" value="1"/>
 		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
 		<param name="homogeneous_NoCs" value="1"/>
 		<param name="core_tech_node" value="45"/><!-- nm -->
 		<param name="target_core_clockrate" value="3700"/><!--MHz -->
 		<param name="temperature" value="380"/> <!-- Kelvin -->
 		<param name="number_cache_levels" value="2"/>
 		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
 		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
 		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
 		<param name="machine_bits" value="64"/>
 		<param name="virtual_address_width" value="64"/>
 		<param name="physical_address_width" value="52"/>
 		<param name="virtual_memory_page_size" value="4096"/>
 		<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
 			the default value is machine_bits, if not set -->
 		<stat name="total_cycles" value="100000"/>
 		<stat name="idle_cycles" value="0"/>
 		<stat name="busy_cycles"  value="100000"/>
 			<!-- The virtual_memory_page_size (B) is completely different from the page size in the main memory section. It is the size of a
 			virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
 		<!-- *********************** cores ******************* -->
 		<component id="system.core0" name="core0">
 			<!-- Core property -->
 			<param name="clock_rate" value="3700"/>
 			<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
 			<param name="opt_local" value="1"/>
 			<param name="instruction_length" value="32"/>
 			<param name="opcode_width" value="16"/>
 			<param name="x86" value="1"/>
 			<param name="micro_opcode_width" value="8"/>
 			<param name="machine_type" value="0"/>
 			<!-- inorder/OoO; 1 inorder; 0 OOO-->
 			<param name="number_hardware_threads" value="1"/>
 			<!-- number_instruction_fetch_ports (icache ports) is always 1 in single-thread processors;
 			it may only be more than one in SMT processors. BTB ports always equal fetch ports, since
 			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once. -->
 			<param name="fetch_width" value="4"/>
 			<!-- fetch_width determines the size of cachelines of L1 cache block -->
 			<param name="number_instruction_fetch_ports" value="1"/>
 			<param name="decode_width" value="4"/>
 			<!-- decode_width determines the number of ports of the
 			renaming table (both RAM and CAM) scheme -->
 			<param name="issue_width" value="4"/>
 			<param name="peak_issue_width" value="6"/><!--As shown in Wiki figure which has max 5 ports, store data/address is modeled 
 														  as a single port.-->
 			<!-- issue_width determines the number of ports of the issue window and other logic,
 			as in the complexity-effective processors paper; issue_width==dispatch_width -->
 			<param name="commit_width" value="4"/>
 			<!-- commit_width determines the number of ports of register files -->
 			<param name="fp_issue_width" value="2"/>
 			<param name="prediction_width" value="1"/> 
 			<!-- number of branch instructions that can be predicted simultaneously -->
 			<!-- Current version of McPAT does not distinguish int and floating point pipelines.
 			These parameters are reserved for future use. -->
 			<param name="pipelines_per_core" value="1,1"/>
 			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
 			<param name="pipeline_depth" value="14,14"/>
 			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
 			<!-- issue and exe unit-->
 			<param name="ALU_per_core" value="6"/>
 			<!-- contains an adder, a shifter, and a logical unit -->
 			<param name="MUL_per_core" value="1"/>
 			<!-- For MUL and Div -->
 			<param name="FPU_per_core" value="2"/>		
 			<!-- buffer between IF and ID stage -->
 			<param name="instruction_buffer_size" value="32"/><!--Inst. + micro-op -->
 			<!-- buffer between ID and sche/exe stage -->
 			<param name="decoded_stream_buffer_size" value="16"/>
 			<param name="instruction_window_scheme" value="1"/><!-- 0 PHYREG based, 1 RSBASED-->
 			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
 			<param name="instruction_window_size" value="32"/>
 			<param name="fp_instruction_window_size" value="32"/>
 			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
 			<param name="ROB_size" value="96"/>
 			<!-- each in-flight instruction has an entry in ROB -->
 			<!-- registers -->
 			<param name="archi_Regs_IRF_size" value="16"/><!-- X86-64 has 16GPR -->			
 			<param name="archi_Regs_FRF_size" value="32"/><!-- MMX + XMM -->
 			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
 			renaming logic is for both integer and floating point insts.  -->
 			<param name="phy_Regs_IRF_size" value="256"/>
 			<param name="phy_Regs_FRF_size" value="256"/>
 			<!-- rename logic -->
 			<param name="rename_scheme" value="0"/>
 			<!-- can be RAM based(0) or CAM based(1) rename scheme 
 			RAM-based scheme will have free list, status table;
 			CAM-based scheme have the valid bit in the data field of the CAM 
 			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
 			Detailed RAT Implementation see TR -->
 			<param name="register_windows_size" value="0"/>
 			<!-- how many windows in the windowed register file, sun processors;
 			no register windowing is used when this number is 0 -->
 			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (OoO, Alpha).
 			They will always try to execute out-of-order though. -->
 			<param name="LSU_order" value="inorder"/>
 			<param name="store_buffer_size" value="96"/>
 			<!-- By default, in-order cores do not have load buffers -->
 			<param name="load_buffer_size" value="48"/>	
 			<!-- number of ports refer to sustainable concurrent memory accesses --> 
 			<param name="memory_ports" value="2"/>	
 			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
 			as well as the ports of Dcache which is connected to LSU -->
 			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
 			<param name="RAS_size" value="64"/>						
 			<!-- general stats, define simulation periods; total, idle, and busy cycles are required for sanity check  -->
 			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
 			<stat name="total_instructions" value="400000"/>
 			<stat name="int_instructions" value="200000"/>
 			<stat name="fp_instructions" value="100000"/>
 			<stat name="branch_instructions" value="100000"/>
 			<stat name="branch_mispredictions" value="0"/>
 			<stat name="load_instructions" value="0"/>
 			<stat name="store_instructions" value="50000"/>
 			<stat name="committed_instructions" value="400000"/>
 			<stat name="committed_int_instructions" value="200000"/>
 			<stat name="committed_fp_instructions" value="100000"/>
 			<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
 			<!-- the following cycle stats are used for heterogeneous cores only,
 				please ignore them if cores are homogeneous -->
 			<stat name="total_cycles" value="100000"/>
 		    <stat name="idle_cycles" value="0"/>
 		    <stat name="busy_cycles"  value="100000"/>
 			<!-- instruction buffer stats -->
 			<!-- ROB stats, both RS and Phy based OoOs have ROB;
 			the performance simulator should capture the difference in accesses,
 			otherwise, McPAT has to guess based on the number of committed instructions. -->
 			<stat name="ROB_reads" value="400000"/>
 			<stat name="ROB_writes" value="400000"/>
 			<!-- RAT accesses -->
 			<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
 			<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
 			<stat name="fp_rename_reads" value="200000"/>
 			<stat name="fp_rename_writes" value="100000"/>
 			<!-- decode and rename stage use this, should be total ic - nop -->
 			<!-- Inst window stats -->
 			<stat name="inst_window_reads" value="400000"/>
 			<stat name="inst_window_writes" value="400000"/>
 			<stat name="inst_window_wakeup_accesses" value="800000"/>
 			<stat name="fp_inst_window_reads" value="200000"/>
 			<stat name="fp_inst_window_writes" value="200000"/>
 			<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
 			<!--  RF accesses -->
 			<stat name="int_regfile_reads" value="600000"/>
 			<stat name="float_regfile_reads" value="100000"/>
 			<stat name="int_regfile_writes" value="300000"/>
 			<stat name="float_regfile_writes" value="50000"/>
 			<!-- accesses to the working reg -->
 			<stat name="function_calls" value="5"/>
 			<stat name="context_switches" value="260343"/>
 			<!-- Number of window switches (number of function calls and returns)-->
 			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
 			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
 			<stat name="ialu_accesses" value="300000"/>			
 			<stat name="fpu_accesses" value="100000"/>
 			<stat name="mul_accesses" value="200000"/>
 			<stat name="cdb_alu_accesses" value="300000"/>
 			<stat name="cdb_mul_accesses" value="200000"/>
 			<stat name="cdb_fpu_accesses" value="100000"/>
 			<!-- multiple cycle accesses should be counted multiple times, 
 			otherwise, McPAT can use internal counter for different floating point instructions 
 			to get final accesses. But that needs detailed info for floating point inst mix -->
 			<!--  currently the performance simulator should
 			make sure all the numbers are final numbers,
 			including the explicit read/write accesses,
 			and the implicit accesses such as replacements, etc.
 			Future versions of McPAT may be able to infer the implicit accesses
 			based on param and stats of last level cache.
 			The same rule applies to all cache access stats too!  -->
 			<!-- following is AF for max power computation. 
 				Do not change them, unless you understand them-->
 			<stat name="IFU_duty_cycle" value="1"/>			
 			<stat name="LSU_duty_cycle" value="0.5"/>
 			<stat name="MemManU_I_duty_cycle" value="1"/>
 			<stat name="MemManU_D_duty_cycle" value="0.5"/>
 			<stat name="ALU_duty_cycle" value="1"/>
 			<stat name="MUL_duty_cycle" value="0.3"/>
 			<stat name="FPU_duty_cycle" value="0.3"/>
 			<stat name="ALU_cdb_duty_cycle" value="1"/>
 			<stat name="MUL_cdb_duty_cycle" value="0.3"/>
 			<stat name="FPU_cdb_duty_cycle" value="0.3"/>
 			<param name="number_of_BPT" value="2"/>
 			<component id="system.core0.predictor" name="PBT">
 				<!-- branch predictor; tournament predictor see Alpha implementation -->
 				<param name="local_predictor_size" value="10,3"/>
 				<param name="local_predictor_entries" value="1024"/>
 				<param name="global_predictor_entries" value="4096"/>
 				<param name="global_predictor_bits" value="2"/>
 				<param name="chooser_predictor_entries" value="4096"/>
 				<param name="chooser_predictor_bits" value="2"/>
 				<!-- These parameters can be combined like below in next version
 				<param name="load_predictor" value="10,3,1024"/>
 				<param name="global_predictor" value="4096,2"/>
 				<param name="predictor_chooser" value="4096,2"/>
 				-->
 			</component>
 			<component id="system.core0.itlb" name="itlb">
 				<param name="number_entries" value="128"/>
 				<stat name="total_accesses" value="200000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
 				which is actually a replacement -->
 			</component>
 			<component id="system.core0.icache" name="icache">
 				<!-- there are no write requests to the icache, although writes happen to it after a miss,
 				which is actually a replacement -->
 				<param name="icache_config" value="32768,32,8,1,4,4,32,0"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
 				<!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
 				<param name="buffer_sizes" value="16, 16, 16,0"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
 				<stat name="read_accesses" value="200000"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="conflicts" value="0"/>				
 			</component>
 			<component id="system.core0.dtlb" name="dtlb">
 				<param name="number_entries" value="256"/><!--dual threads-->
 				<stat name="total_accesses" value="400000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.dcache" name="dcache">
 			        <!-- all the buffer related are optional -->
 				<param name="dcache_config" value="32768,32,8,1, 4,6, 32,1 "/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<param name="number_of_BTB" value="2"/>
 			<component id="system.core0.BTB" name="BTB">
 			        <!-- all the buffer related are optional -->
 				<param name="BTB_config" value="5120,4,2,1, 1,3"/> <!--should be 4096 + 1024 -->
 				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 				<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
 				<stat name="write_accesses" value="0"/>
 			</component>
 	</component>
 		<component id="system.L1Directory0" name="L1Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="4096,2,0,1,100,100, 8"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3400"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="20"/>	
 		</component>
 		<component id="system.L2Directory0" name="L2Directory0">
 				<param name="Directory_type" value="1"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3400"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="100"/>	
 		</component>
 		<component id="system.L20" name="L20">
 			<!-- all the buffer related are optional -->
 				<param name="L2_config" value="6291456,64, 16, 8, 8, 23, 32, 1"/> 
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<param name="clockrate" value="3700"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			    <stat name="duty_cycle" value="1.0"/>	
 		</component>
 <!--**********************************************************************-->
 <component id="system.L30" name="L30">
 				<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 				<param name="clockrate" value="850"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="11824"/>
 				<stat name="write_accesses" value="11276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 				<stat name="duty_cycle" value="1.0"/>	
 		</component>
 <!--**********************************************************************-->
 		<component id="system.NoC0" name="noc0">
 			<param name="clockrate" value="3400"/>
 			<param name="type" value="0"/>
 			<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
 				at each time only one node can send req -->
 			<param name="horizontal_nodes" value="1"/>
 			<param name="vertical_nodes" value="1"/>
 			<param name="has_global_link" value="0"/>
 			<!-- 1 has global link, 0 does not have global link -->
 			<param name="link_throughput" value="1"/><!--w.r.t clock -->
 			<param name="link_latency" value="1"/><!--w.r.t clock -->
 			<!-- throughput >= latency -->
 			<!-- Router architecture -->
 			<param name="input_ports" value="1"/>
 			<param name="output_ports" value="1"/>
 			<!-- For bus the I/O ports should be 1 -->
 			<param name="flit_bits" value="256"/>
 			<param name="chip_coverage" value="1"/>
 			<!-- When multiple NOC present, one NOC will cover part of the whole chip. 
 				chip_coverage <=1 -->
 			<param name="link_routing_over_percentage" value="0.5"/>
 			<!-- Links can route over other components or occupy whole area.
 				by default, 50% of the NoC global links routes over other 
 				components -->
 			<stat name="total_accesses" value="100000"/>
 			<!-- This is the number of total accesses within the whole network not for each router -->
 			<stat name="duty_cycle" value="1"/>
 		</component>		
 <!--**********************************************************************-->
 		<component id="system.mem" name="mem">
 			<!-- Main memory property -->
 			<param name="mem_tech_node" value="32"/>
 			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
 			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
 			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
 			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
 			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
 			<!-- above numbers can be easily found from Wikipedia -->
 			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
 			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
 			Current McPAT assumes single DIMMs are used.--> 		
 			<param name="number_ranks" value="2"/>
 			<param name="num_banks_of_DRAM_chip" value="8"/>			
 			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
 			<param name="output_width_of_DRAM_chip" value="8"/>
 			<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
 			<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
 			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
 			<param name="burstlength_of_DRAM_chip" value="8"/>
 			<stat name="memory_accesses" value="1052"/>
 			<stat name="memory_reads" value="1052"/>
 			<stat name="memory_writes" value="1052"/>									
 		</component>
 		<component id="system.mc" name="mc">
 			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
 			<!-- current version of McPAT uses published values for base parameters of memory controller;
 			improvements on MC will be added in later versions. -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
 			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
 			<param name="block_size" value="64"/><!--B-->
 			<param name="number_mcs" value="0"/>
 			<!-- current McPAT only supports homogeneous memory controllers -->
 			<param name="memory_channels_per_mc" value="1"/>
 			<param name="number_ranks" value="2"/>
 			<param name="withPHY" value="0"/>
 			<!-- # of ranks of each channel-->
 			<param name="req_window_size_per_channel" value="32"/>
 			<param name="IO_buffer_size_per_channel" value="32"/>
 			<param name="databus_width" value="128"/>
 			<param name="addressbus_width" value="51"/>
 			<!-- McPAT will add the control bus width to the addressbus width automatically -->
 			<stat name="memory_accesses" value="33333"/>
 			<stat name="memory_reads" value="16667"/>
 			<stat name="memory_writes" value="16667"/>
 			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
 			the average power per MC or per channel. This is sufficient for most applications.
 			Further breakdown can be easily added in later versions. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.niu" name="niu">
 			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
 			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
 				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual nics; instead, it takes the total accesses and calculates
 			the average power per nic or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.pcie" name="pcie">
 			<!-- On chip PCIe controller, including Phy-->
 			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
 				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="withPHY" value="1"/>
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/>
 			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual pcie controllers; instead, it takes the total accesses and calculates
 			the average power per pcie controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.flashc" name="flashc">
 		    <param name="number_flashcs" value="0"/>
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
            <param name="withPHY" value="1"/>
 			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
 			the average power per fc or per channel. This is sufficient for most applications -->
 		</component>
 <!--**********************************************************************-->
 		</component>
 </component>
--- a/ext/mcpat/README
+++ b/ext/mcpat/README
@ -0,0 +1,226 @@
 __  __      ____   _  _____   ____       _         
 |  \/  | ___|  _ \ / \|_   _| | __ )  ___| |_  __ _ 
 | |\/| |/ __| |_) / _ \ | |   |  _ \ / _ \ __|/ _` |
 | |  | | (__|  __/ ___ \| |   | |_) |  __/ |_| (_| |
 |_|  |_|\___|_| /_/   \_\_|   |____/ \___|\__|\__,_|
 McPAT: Multicore Power, Area, and Timing
 Current version 0.8Beta 
 ===============================
 McPAT is an architectural modeling tool for chip multiprocessors (CMP)
 The main focus of McPAT is accurate power and area
 modeling, and a target clock rate is used as a design constraint. 
 McPAT performs automatic extensive search to find optimal designs 
 that satisfy the target clock frequency.  
 For complete documentation of McPAT, please refer to the McPAT 1.0
 technical report and the following paper,
 "McPAT: An Integrated Power, Area, and Timing Modeling
 Framework for Multicore and Manycore Architectures", 
 that appears in MICRO 2009. Please cite the paper, if you use
 McPAT in your work. The bibtex entry is provided below for your convenience.
 @inproceedings{mcpat:micro,
 author = {Sheng Li and Jung Ho Ahn and Richard D. Strong and Jay B. Brockman and Dean M. Tullsen and Norman P. Jouppi},
 title =  "{McPAT: An Integrated Power, Area, and Timing Modeling Framework for Multicore and Manycore Architectures}",
 booktitle = {MICRO 42: Proceedings of the 42nd Annual IEEE/ACM International Symposium on Microarchitecture},
 year = {2009},
 pages = {469--480},
 }
 Current McPAT is in its beta release. 
 List of features of beta release
 ===============================
 The following are the list of features supported by the tool. 
 * Power, area, and timing models for CMPs with:
      Inorder cores both single and multithreaded
      OOO cores both single and multithreaded
      Shared/coherent caches with directory hardware:
      	including directory cache, shadowed tag directory
      	and static bank mapped tag directory
      Network-on-Chip
      On-chip memory controllers
 * Internal models are based on real modern processors:
  Inorder models are based on Sun Niagara family
  OOO models are based on Intel P6 for reservation 
  station based OOO cores, and on Intel Netburst and 
  Alpha 21264 for physical register file based OOO cores.     
 * Leakage power modeling considers both sub-threshold leakage 
  and gate leakage power. The impact of operating temperature 
  on both kinds of leakage power is considered. Longer channel 
  devices that can reduce leakage significantly with a modest 
  performance penalty are also modeled.
 * McPAT supports automatic extensive search to find optimal designs 
  that satisfy the target clock frequency. The timing constraints 
  include both throughput and latency.
 * Interconnect model with different delay, power, and area 
  properties, as well as both the aggressive and conservative 
  interconnect projections on wire technologies. 
 * All process specific values used by the McPAT are obtained
  from ITRS and currently, the McPAT supports 90nm, 65nm, 45nm, 
  32nm, and 22nm technology nodes. At 32nm and 22nm nodes, SOI 
  and DG devices are used. After 45nm, Hi-K metal gates are used.
 How to use the tool?
 ====================
 McPAT takes input parameters from an XML-based interface,
 then it computes the area and peak power of the target processor.
 Please note that the peak power is the absolute worst case power, 
 which could be even higher than TDP. 
 1. Steps to run McPAT:
   -> define the target processor using inorder.xml or OOO.xml 
   -> run the "mcpat" binary:
      ./mcpat -infile <*.xml>  -print_level < level of detailed output>
      ./mcpat -h (or mcpat --help) will show the quick help message.
   Rather than being hardwired to certain simulators, McPAT 
   uses an XML-based interface to enable easy integration
   with various performance simulators. Our collaborator, 
   Richard Strong, at University of California, San Diego, 
   designed an experimental parser for the M5 simulator, aiming for 
   streamlining the integration of McPAT and M5. Please check the M5 
   repository/ for the latest version of the parser.
 2. Optimize:
   McPAT will try its best to satisfy the target clock rate. 
   When it cannot find a valid solution, it gives out warnings, 
   while still giving a solution that is closest to the timing 
   constraints and calculating power based on it. The optimization 
   will lead to larger power/area numbers for a higher target clock
   rate. McPAT also provides the option "-opt_for_clk" to turn on 
   ("-opt_for_clk 1") and off this strict optimization for the 
   timing constraint. When it is off, McPAT always optimizes each 
   component for ED^2P without worrying about meeting the 
   target clock frequency. Turning it off reduces the computation 
   time, which suits situations where the target clock rate
   is conservative.
 3. The output:
   McPAT outputs results in a hierarchical manner. Increasing 
   the "-print_level" will show detailed results inside each 
   component. For each component, major parts are shown, and associated 
   pipeline registers/control logic are added up in total area/power of each 
   components. In general, McPAT does not model the area/overhead of the pad 
   frame used in a processor die.
 4. How to use the XML interface for McPAT 
   4.1 Set up the parameters
   		Parameters of target designs need to be set in the *.xml file for 
   		entries tagged as "param". McPAT has very detailed parameter settings. 
   		Please remove a structure parameter from the file if you want 
   		to use its default value. Otherwise, the parameters in the xml file 
   		will override the default values. 
   4.2 Pass the statistics
   		There are two options to get the correct stats: a) the performance 
   		simulator can capture all the stats in detail and pass them to McPAT;
   		b) the performance simulator can capture only partial stats and pass 
   		them to McPAT, while McPAT can reason about the complete stats using 
        the partial information and the configuration. Therefore, there is 
        some overlap among the stats. 
   4.3 Interface XML file structures (PLEASE READ!)
   			The XML is hierarchical from processor level to micro-architecture 
   		level. McPAT supports both heterogeneous and homogeneous manycore processors. 
   			1). For heterogeneous processor setup, each component (core, NoC, cache, 
   		etc.) must have its own instantiations (core0, core1, ..., coreN). 
   		Each instantiation will have different parameters as well as its own stats.
   		Thus, the XML file must have multiple "instantiations" of each type of 
   		heterogeneous component and the corresponding hetero flags must be set 
   		in the XML file. Then the stats in the XML should be the stats of "a" instantiation 
   		(e.g. "a" cores). The reported runtime dynamic is of a single instantiation 
   		(e.g. "a" cores). Since the stats for each (e.g. "a" cores) may be different,
   		we will see a whole list of (e.g. "a" cores) with different dynamic power,
   		and total power is just a sum of them.  
   			2). For homogeneous processors, the same method for heterogeneous can 
   		also be used by treating all homogeneous instantiations as heterogeneous. 
   		However, a preferred approach is to use a single representative for all 
   		the same components (e.g. core0 to represent all cores) and set the 
   		processor to have homogeneous components (e.g. <param name="homogeneous_cores
   		" value="1"/> ). Thus, the XML file only has one instantiation to represent 
   		all others with the same architectural parameters. The corresponding homo 
   		flags must be set in the XML file.  Then, the stats in the XML should be 
   		the aggregated stats of the sum of all instantiations (e.g. aggregated stats 
   		of all cores). In the final results, McPAT will only report a single 
   		instantiation of each type of component, and the reported runtime dynamic power
   		is the sum of all instantiations of the same type. This approach can run fast 
   		and use much less memory.        
 5. Guide for integrating McPAT into performance simulators and bypassing the XML interface
   		The detailed work flow of McPAT has two phases: the initialization phase and
   the computation phase. Specifically, in order to start the initialization phase a 
   user specifies static configurations, including parameters at all three levels, 
   namely, architectural, circuit, and technology levels. During the initialization 
   phase, McPAT will generate the internal chip representation using the configurations 
   set by the user. 
   		The computation phase of McPAT is called by McPAT or the performance simulator 
   during simulation to generate runtime power numbers. Before calling McPAT to 
   compute runtime power numbers, the performance simulator needs to pass the 
   statistics, namely, the activity factors of each individual component, to McPAT 
   via the XML interface. 
   		The initialization phase is very time-consuming, since it will repeat many 
   times until valid configurations are found or the possible configurations are 
   exhausted. To reduce the overhead, a user can let the simulator call McPAT 
   directly for computation phase and only call initialization phase once at the 
   beginning of simulation. In this case, the XML interface file is bypassed, 
   please refer to processor.cc to see how the two phases are called.
 6. Sample input files:
   This package provides sample XML files for validating target processors. Please find the 
   enclosed Niagara1.xml (for the Sun Niagara1 processor), Niagara2.xml (for the Sun Niagara2 
   processor), Alpha21364.xml (for the Alpha21364 processor), and Xeon.xml (for the Intel 
   Xeon Tulsa processor). 
   Special instructions for using Xeon.xml:
   McPAT uses ITRS device types including HP, LSTP, and LOP. Although most 
   designs follow ITRS projections, there are designs with special technologies. 
   For example, the 65nm Xeon Tulsa processor uses 1.25 V rather than 1.1V 
   for the core voltage domain, which results in the changes in threshold voltage,
   leakage current density, saturation current, and etc, besides the different 
   supply voltage. We use MASTAR to match the special technology as used in Xeon 
   core domain. Therefore, in order to generate accurate results of Xeon 
   Tulsa cores, users need to do make TAR=mcpatXeonCore and use the generated 
   special executable. The L3 cache and buses must be computed using standard 
   ITRS technology.    
 ====================
 McPAT is in its beginning stage. We are still improving 
 the tool and refining the code. Please come back to its website 
 for newer versions. If you have any comments, 
 questions, or suggestions, please write to us.
 Version history and roadmap
 McPAT Alpha:      released Sep. 2009 Experimental release
 McPAT Beta (0.6): released Nov. 2009 New code base and technology base
 McPAT Beta (0.7): released May. 2010 Added various new models, 
                  including long channel devices, buses model; together
                  with bug fixes and extensive code optimization to reduce 
                  memory usage.  
 McPAT Beta (0.8): released Aug. 2010 Added various new models, 
                  including on-chip 10Gb ethernet units, PCIe, and flash controllers.
 Next major release:     
 McPAT 1.0:        including advanced power-saving states
 Future releases may include the modeling of embedded low-power 
 processors as well as vector processors and GPGPUs.             
 Sheng Li             
 sheng.li@hp.com 
--- a/ext/mcpat/XML_Parse.cc
+++ b/ext/mcpat/XML_Parse.cc
--- a/ext/mcpat/XML_Parse.h
+++ b/ext/mcpat/XML_Parse.h
@ -0,0 +1,591 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef XML_PARSE_H_
 #define XML_PARSE_H_
 //#ifdef WIN32
 //#define _CRT_SECURE_NO_DEPRECATE
 //#endif
 #include <stdio.h>
 #include <string.h>
 #include <iostream>
 #include "xmlParser.h"
 using namespace std;
 /*
 void myfree(char *t); // {free(t);}
 ToXMLStringTool tx,tx2;
 */
 //all subnodes at the level of system.core(0-n)
 //cache_policy is added into cache property arrays; 0: no write or write-through with non-write allocate; 1: write-back with write-allocate
 // Predictor sub-node of a core; system_core embeds one instance as its
 // `predictor` member. Every field is an int (or int/char array) except
 // predictor_accesses, which is a double.
 typedef struct{
        int prediction_width;
        char prediction_scheme[20]; // fixed 20-byte buffer for the scheme name
        int predictor_size;
        int predictor_entries;
        int local_predictor_size[20]; // array of up to 20 values
        int local_predictor_entries;
        int global_predictor_entries;
        int global_predictor_bits;
        int chooser_predictor_entries;
        int chooser_predictor_bits;
        double predictor_accesses;
 } predictor_systemcore;
 // ITLB sub-node of a core; system_core embeds one instance as its `itlb`
 // member. The two ints are configuration, the doubles are counters.
 typedef struct{
        int number_entries;
        int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
        double total_hits;
        double total_accesses;
        double total_misses;
        double conflicts;
 } itlb_systemcore;
 // Instruction-cache sub-node of a core; system_core embeds one instance as
 // its `icache` member. Split into configuration ("params") and counters
 // ("stats"). Unlike dcache_systemcore it has no write_* or wbb_* counters.
 typedef struct{
        //params
        double icache_config[20]; // up to 20 configuration values; meaning of each slot is not defined here
        int buffer_sizes[20];
        int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
        //stats
        double total_accesses;
        double read_accesses;
        double read_misses;
        double replacements;
        double read_hits;
        double total_hits;
        double total_misses;
        double miss_buffer_access; // NOTE: singular here; system_L2/system_L3 spell it miss_buffer_accesses
        double fill_buffer_accesses;
        double prefetch_buffer_accesses;
        double prefetch_buffer_writes;
        double prefetch_buffer_reads;
        double prefetch_buffer_hits;
        double conflicts;
 } icache_systemcore;
 // DTLB sub-node of a core; system_core embeds one instance as its `dtlb`
 // member. Compared with itlb_systemcore it additionally carries separate
 // read/write access, hit and miss counters.
 typedef struct{
        //params
        int number_entries;
        int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
        //stats
        double total_accesses;
        double read_accesses;
        double write_accesses;
        double write_hits;
        double read_hits;
        double read_misses;
        double write_misses;
        double total_hits;
        double total_misses;
        double conflicts;
 } dtlb_systemcore;
 // Data-cache sub-node of a core; system_core embeds one instance as its
 // `dcache` member. Split into configuration ("params") and counters
 // ("stats"), including write, write-back and wbb_* counters that
 // icache_systemcore does not have.
 typedef struct{
        //params
        double dcache_config[20]; // up to 20 configuration values; meaning of each slot is not defined here
        int buffer_sizes[20];
        int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
        //stats
        double total_accesses;
        double read_accesses;
        double write_accesses;
        double total_hits;
        double total_misses;
        double read_hits;
        double write_hits;
        double read_misses;
        double write_misses;
        double replacements;
        double write_backs;
        double miss_buffer_access; // NOTE: singular here; system_L2/system_L3 spell it miss_buffer_accesses
        double fill_buffer_accesses;
        double prefetch_buffer_accesses;
        double prefetch_buffer_writes;
        double prefetch_buffer_reads;
        double prefetch_buffer_hits;
        double wbb_writes;
        double wbb_reads;
        double conflicts;
 } dcache_systemcore;
 // BTB sub-node of a core; system_core embeds one instance as its `BTB`
 // member. One integer configuration array plus access/hit/miss counters.
 typedef struct{
        //params
        int BTB_config[20]; // up to 20 configuration values; meaning of each slot is not defined here
        //stats
        double total_accesses;
        double read_accesses;
        double write_accesses;
        double total_hits;
        double total_misses;
        double read_hits;
        double write_hits;
        double read_misses;
        double write_misses;
        double replacements;
 } BTB_systemcore;
 // One core entry (system.core(0-n)); root_system holds an array of 64 of
 // these. Laid out in three sections: per-core parameters, per-core
 // statistics (all doubles), and the nested sub-node structs (predictor,
 // itlb, icache, dtlb, dcache, BTB).
 typedef struct{
        //all params at the level of system.core(0-n)
        int clock_rate;
        bool opt_local;
        bool x86;
        int machine_bits;
        int virtual_address_width;
        int physical_address_width;
        int opcode_width;
        int micro_opcode_width;
        int instruction_length;
        int machine_type;
        int internal_datapath_width;
        int number_hardware_threads;
        int fetch_width;
        int number_instruction_fetch_ports;
        int decode_width;
        int issue_width;
        int peak_issue_width;
        int commit_width;
        int pipelines_per_core[20];
        int pipeline_depth[20];
        char FPU[20];
        char divider_multiplier[20];
        int ALU_per_core;
        double FPU_per_core; // NOTE: double, whereas ALU_per_core and MUL_per_core are int
        int MUL_per_core;
        int instruction_buffer_size;
        int decoded_stream_buffer_size;
        int instruction_window_scheme;
        int instruction_window_size;
        int fp_instruction_window_size;
        int ROB_size;
        int archi_Regs_IRF_size;
        int archi_Regs_FRF_size;
        int phy_Regs_IRF_size;
        int phy_Regs_FRF_size;
        int rename_scheme;
        int register_windows_size;
        char LSU_order[20];
        int store_buffer_size;
        int load_buffer_size;
        int memory_ports;
        char Dcache_dual_pump[20];
        int RAS_size;
        int fp_issue_width;
        int prediction_width; // also present in the nested predictor_systemcore
        int number_of_BTB;
        int number_of_BPT;
        //all stats at the level of system.core(0-n)
        double total_instructions;
        double int_instructions;
        double fp_instructions;
        double branch_instructions;
        double branch_mispredictions;
        double committed_instructions;
        double committed_int_instructions;
        double committed_fp_instructions;
        double load_instructions;
        double store_instructions;
        double total_cycles;
        double idle_cycles;
        double busy_cycles;
        double instruction_buffer_reads;
        double instruction_buffer_write; // NOTE: singular "write", unlike the other *_writes counters
        double ROB_reads;
        double ROB_writes;
        double rename_accesses;
        double fp_rename_accesses;
        double rename_reads;
        double rename_writes;
        double fp_rename_reads;
        double fp_rename_writes;
        double inst_window_reads;
        double inst_window_writes;
        double inst_window_wakeup_accesses;
        double inst_window_selections;
        double fp_inst_window_reads;
        double fp_inst_window_writes;
        double fp_inst_window_wakeup_accesses;
        double fp_inst_window_selections;
        double archi_int_regfile_reads;
        double archi_float_regfile_reads;
        double phy_int_regfile_reads;
        double phy_float_regfile_reads;
        double phy_int_regfile_writes;
        double phy_float_regfile_writes;
        double archi_int_regfile_writes;
        double archi_float_regfile_writes;
        double int_regfile_reads;
        double float_regfile_reads;
        double int_regfile_writes;
        double float_regfile_writes;
        double windowed_reg_accesses;
        double windowed_reg_transports;
        double function_calls;
        double context_switches;
        double ialu_accesses;
        double fpu_accesses;
        double mul_accesses;
        double cdb_alu_accesses;
        double cdb_mul_accesses;
        double cdb_fpu_accesses;
        double load_buffer_reads;
        double load_buffer_writes;
        double load_buffer_cams;
        double store_buffer_reads;
        double store_buffer_writes;
        double store_buffer_cams;
        double store_buffer_forwards;
        double main_memory_access;
        double main_memory_read;
        double main_memory_write;
        double pipeline_duty_cycle;
        double IFU_duty_cycle ;
        double BR_duty_cycle ;
        double LSU_duty_cycle ;
        double MemManU_I_duty_cycle;
        double MemManU_D_duty_cycle ;
        double ALU_duty_cycle ;
        double MUL_duty_cycle ;
        double FPU_duty_cycle ;
        double ALU_cdb_duty_cycle ;
        double MUL_cdb_duty_cycle ;
        double FPU_cdb_duty_cycle ;
        //all subnodes at the level of system.core(0-n)
        predictor_systemcore predictor;
        itlb_systemcore itlb;
        icache_systemcore icache;
        dtlb_systemcore dtlb;
        dcache_systemcore dcache;
        BTB_systemcore BTB;
 } system_core;
 // One L1 directory entry; root_system holds an array of 64 of these.
 // The field list is identical to system_L2Directory.
 typedef struct{
        //params
        int Directory_type;
        double Dir_config[20]; // up to 20 configuration values; meaning of each slot is not defined here
        int buffer_sizes[20];
        int clockrate;
        int ports[20];
        int device_type;
        int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
        char threeD_stack[20];
        //stats
        double total_accesses;
        double read_accesses;
        double write_accesses;
        double read_misses;
        double write_misses;
        double conflicts;
        double duty_cycle;
 } system_L1Directory;
 // One L2 directory entry; root_system holds an array of 64 of these.
 // The field list is identical to system_L1Directory.
 typedef struct{
        //params
        int Directory_type;
        double Dir_config[20]; // up to 20 configuration values; meaning of each slot is not defined here
        int buffer_sizes[20];
        int clockrate;
        int ports[20];
        int device_type;
        int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
        char threeD_stack[20];
        //stats
        double total_accesses;
        double read_accesses;
        double write_accesses;
        double read_misses;
        double write_misses;
        double conflicts;
        double duty_cycle;
 } system_L2Directory;
 // One L2 cache entry; root_system holds an array of 64 of these.
 // Apart from the name of the config array (L2_config vs L3_config) the
 // field list matches system_L3. The trailing merged_dir flag and the
 // homenode_* / dir_duty_cycle counters sit after the cache counters.
 typedef struct{
        //params
        double L2_config[20]; // up to 20 configuration values; meaning of each slot is not defined here
        int clockrate;
        int ports[20];
        int device_type;
        int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
        char threeD_stack[20];
        int buffer_sizes[20];
        //stats
        double total_accesses;
        double read_accesses;
        double write_accesses;
        double total_hits;
        double total_misses;
        double read_hits;
        double write_hits;
        double read_misses;
        double write_misses;
        double replacements;
        double write_backs;
        double miss_buffer_accesses;
        double fill_buffer_accesses;
        double prefetch_buffer_accesses;
        double prefetch_buffer_writes;
        double prefetch_buffer_reads;
        double prefetch_buffer_hits;
        double wbb_writes;
        double wbb_reads;
        double conflicts;
        double duty_cycle;
        bool   merged_dir;
        double homenode_read_accesses;
        double homenode_write_accesses;
        double homenode_read_hits;
        double homenode_write_hits;
        double homenode_read_misses;
        double homenode_write_misses;
        double dir_duty_cycle;
 } system_L2;
 // One L3 cache entry; root_system holds an array of 64 of these.
 // Apart from the name of the config array (L3_config vs L2_config) the
 // field list matches system_L2. The trailing merged_dir flag and the
 // homenode_* / dir_duty_cycle counters sit after the cache counters.
 typedef struct{
        //params
        double L3_config[20]; // up to 20 configuration values; meaning of each slot is not defined here
        int clockrate;
        int ports[20];
        int device_type;
        int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
        char threeD_stack[20];
        int buffer_sizes[20];
        //stats
        double total_accesses;
        double read_accesses;
        double write_accesses;
        double total_hits;
        double total_misses;
        double read_hits;
        double write_hits;
        double read_misses;
        double write_misses;
        double replacements;
        double write_backs;
        double miss_buffer_accesses;
        double fill_buffer_accesses;
        double prefetch_buffer_accesses;
        double prefetch_buffer_writes;
        double prefetch_buffer_reads;
        double prefetch_buffer_hits;
        double wbb_writes;
        double wbb_reads;
        double conflicts;
        double duty_cycle;
        bool   merged_dir;
        double homenode_read_accesses;
        double homenode_write_accesses;
        double homenode_read_hits;
        double homenode_write_hits;
        double homenode_read_misses;
        double homenode_write_misses;
        double dir_duty_cycle;
 } system_L3;
 // Crossbar sub-node of a NoC; system_NoC embeds one instance as its
 // `xbar0` member. Integer configuration plus a single access counter.
 typedef struct{
        //params
        int number_of_inputs_of_crossbars;
        int number_of_outputs_of_crossbars;
        int flit_bits;
        int input_buffer_entries_per_port;
        int ports_of_input_buffer[20];
        //stats
        double crossbar_accesses;
 } xbar0_systemNoC;
 // One network-on-chip entry; root_system holds an array of 64 of these.
 // Configuration fields come first (with the nested xbar0 crossbar node in
 // the middle of them), followed by three double-valued statistics.
 typedef struct{
        //params
        int clockrate;
        bool type; // NOTE: bool here, whereas system_mc/system_niu/system_pcie declare `type` as int
        bool has_global_link;
        char topology[20];
        int horizontal_nodes;
        int vertical_nodes;
        int link_throughput;
        int link_latency;
        int input_ports;
        int output_ports;
        int virtual_channel_per_port;
        int flit_bits;
        int input_buffer_entries_per_vc;
        int ports_of_input_buffer[20];
        int dual_pump;
        int number_of_crossbars;
        char crossbar_type[20];
        char crosspoint_type[20];
        xbar0_systemNoC xbar0;
        int arbiter_type;
        double chip_coverage;
        //stats
        double total_accesses;
        double duty_cycle;
        double route_over_perc;
 } system_NoC;
 // Memory node; root_system holds exactly one instance as its `mem`
 // member. Integer configuration followed by three access counters.
 typedef struct{
        //params
        int mem_tech_node;
        int device_clock;
        int peak_transfer_rate; // NOTE: int here, but double in system_mc
        int internal_prefetch_of_DRAM_chip;
        int capacity_per_channel;
        int number_ranks;
        int num_banks_of_DRAM_chip;
        int Block_width_of_DRAM_chip;
        int output_width_of_DRAM_chip;
        int page_size_of_DRAM_chip;
        int burstlength_of_DRAM_chip;
        //stats
        double memory_accesses;
        double memory_reads;
        double memory_writes;
 } system_mem;
 // Controller node shared by two root_system members: `mc` and `flashc`
 // are both declared with this type. The leading fields are marked as
 // common to both uses; the "McParam" group and the memory_* counters
 // follow.
 // NOTE(review): presumably the flash-controller use only fills the common
 // fields and number_mcs doubles as the flash-controller count -- confirm
 // against the parser.
 typedef struct{
        //params
    //Common Param for mc and fc
        double peak_transfer_rate;
        int number_mcs;
        bool withPHY;
        int type;
        //FCParam
        //stats
        double duty_cycle;
        double total_load_perc;
        //McParam
        int mc_clock;
    int llc_line_length;
        int memory_channels_per_mc;
        int number_ranks;
        int req_window_size_per_channel;
        int IO_buffer_size_per_channel;
        int databus_width;
        int addressbus_width;
        bool LVDS;
        //stats
        double memory_accesses;
        double memory_reads;
        double memory_writes;
 } system_mc;
 // NIU node; root_system holds exactly one instance as its `niu` member.
 // Three integer parameters and two double-valued statistics.
 typedef struct{
        //params
    int clockrate;
        int number_units;
        int type;
        //stats
        double duty_cycle;
        double total_load_perc;
 } system_niu;
 // PCIe node; root_system holds exactly one instance as its `pcie` member.
 // Field names match the param/stat names of the "system.pcie" component
 // in the sample XML files (type, withPHY, clockrate, number_units,
 // num_channels, duty_cycle, total_load_perc).
 typedef struct{
        //params
    int clockrate;
        int number_units;
        int num_channels;
        int type;
        bool withPHY;
        //stats
        double duty_cycle;
        double total_load_perc;
 } system_pcie;
 // Top-level "system" node: component counts, chip-wide parameters, and
 // the per-component storage. Each replicated component (core,
 // L1Directory, L2Directory, L2, L3, NoC) is a fixed array of 64 entries,
 // so the struct cannot describe more than 64 instances of any of them;
 // mem, mc, flashc, niu and pcie are single instances.
 // NOTE(review): no bounds check against 64 is visible in this header --
 // confirm the parser rejects larger number_of_* values.
 typedef struct{
        //All number_of_* at the level of 'system' Ying 03/21/2009
        int number_of_cores;
        int number_of_L1Directories;
        int number_of_L2Directories;
        int number_of_L2s;
        bool Private_L2;
        int number_of_L3s;
        int number_of_NoCs;
        int number_of_dir_levels;
    int domain_size;
    int first_level_dir;
        // All params at the level of 'system'
        int homogeneous_cores;
        int homogeneous_L1Directories; // sample XML files spell the param "homogeneous_L1Directorys"
        int homogeneous_L2Directories; // sample XML files spell the param "homogeneous_L2Directorys"
        double core_tech_node;
        int target_core_clockrate;
        int target_chip_area;
        int temperature;
        int number_cache_levels;
        int L1_property;
        int L2_property;
        int homogeneous_L2s;
        int L3_property;
        int homogeneous_L3s;
        int homogeneous_NoCs;
        int homogeneous_ccs;
        int Max_area_deviation;
        int Max_power_deviation;
        int device_type;
        bool longer_channel_device;
        bool Embedded;
        bool opt_dynamic_power;
        bool opt_lakage_power; // "lakage" (sic) is part of the identifier; renaming would break users of this field
        bool opt_clockrate;
        bool opt_area;
        int interconnect_projection_type;
        int machine_bits;
        int virtual_address_width;
        int physical_address_width;
        int virtual_memory_page_size;
    double total_cycles;
        //system.core(0-n):3rd level
        system_core core[64];
        system_L1Directory L1Directory[64];
        system_L2Directory L2Directory[64];
        system_L2 L2[64];
        system_L3 L3[64];
    system_NoC NoC[64];
    system_mem mem;
        system_mc mc;
        system_mc flashc; // same struct type as mc
        system_niu niu;
        system_pcie pcie;
 } root_system;
 // Owner of the parsed configuration: exposes the whole root_system tree
 // as the public member `sys`. Both member functions are only declared
 // here; their definitions are not part of this header.
 // NOTE: `sys` embeds several 64-entry component arrays by value, so a
 // ParseXML object is correspondingly large.
 class ParseXML
 {
 public:
        // Takes the path of the input file as a mutable char*.
        // NOTE(review): presumably fills `sys` from the XML file -- confirm in XML_Parse.cc.
        void parse(char* filepath);
    // NOTE(review): presumably resets `sys` to default values -- confirm in XML_Parse.cc.
    void initialize();
 public:
        root_system sys;
 };
 #endif /* XML_PARSE_H_ */
--- a/ext/mcpat/Xeon.xml
+++ b/ext/mcpat/Xeon.xml
@ -0,0 +1,455 @@
 <?xml version="1.0" ?>
 <component id="root" name="root">
 	<component id="system" name="system">
 		<!--McPAT will skip the components if number is set to 0 -->
 		<param name="number_of_cores" value="2"/>
 		<param name="number_of_L1Directories" value="0"/>
 		<param name="number_of_L2Directories" value="0"/>
 		<param name="number_of_L2s" value="1"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
 		<param name="Private_L2" value="1"/><!--1 Private, 0 shared/coherent -->
 		<param name="number_of_L3s" value="1"/> <!-- This number means how many L3 clusters -->
 		<param name="number_of_NoCs" value="1"/>
 		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
 		<param name="homogeneous_L2s" value="1"/>
 		<param name="homogeneous_L1Directorys" value="1"/>
 		<param name="homogeneous_L2Directorys" value="1"/>
 		<param name="homogeneous_L3s" value="1"/>
 		<param name="homogeneous_ccs" value="1"/><!--cache coherece hardware -->
 		<param name="homogeneous_NoCs" value="1"/>
 		<param name="core_tech_node" value="65"/><!-- nm -->
 		<param name="target_core_clockrate" value="3400"/><!--MHz -->
 		<param name="temperature" value="380"/> <!-- Kelvin -->
 		<param name="number_cache_levels" value="3"/>
 		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
 		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
 		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
 		<param name="machine_bits" value="64"/>
 		<param name="virtual_address_width" value="64"/>
 		<param name="physical_address_width" value="52"/>
 		<param name="virtual_memory_page_size" value="4096"/>
 		<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
 			default value is machine_bits, if not set -->
 		<stat name="total_cycles" value="100000"/>
 		<stat name="idle_cycles" value="0"/>
 		<stat name="busy_cycles"  value="100000"/>
 			<!--This page size(B) (virtual_memory_page_size) is completely different from the page size in the Main memory section. This page size is the size of
 			virtual memory from the OS/architecture perspective; the page size in the Main memory section is the actual physical line in a DRAM bank  -->
 		<!-- *********************** cores ******************* -->
 		<component id="system.core0" name="core0">
 			<!-- Core property -->
 			<param name="clock_rate" value="3400"/>
 			<!-- for cores with unknow timing, set to 0 to force off the opt flag -->
 			<param name="opt_local" value="0"/>
 			<param name="instruction_length" value="32"/>
 			<param name="opcode_width" value="16"/>
 			<param name="x86" value="1"/>
 			<param name="micro_opcode_width" value="8"/>
 			<param name="machine_type" value="0"/>
 			<!-- inorder/OoO; 1 inorder; 0 OOO-->
 			<param name="number_hardware_threads" value="2"/>
 			<!-- number_instruction_fetch_ports (icache ports) is always 1 in single-thread processors;
 			it may be more than one only in SMT processors. BTB ports always equal fetch ports, since
 			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once. -->
 			<param name="fetch_width" value="4"/>
 			<!-- fetch_width determines the size of cachelines of L1 cache block -->
 			<param name="number_instruction_fetch_ports" value="1"/>
 			<param name="decode_width" value="4"/>
 			<!-- decode_width determines the number of ports of the
 			renaming table (both RAM and CAM) scheme -->
 			<param name="issue_width" value="4"/>
 			<param name="peak_issue_width" value="6"/>
 			<!-- issue_width determines the number of ports of Issue window and other logic
 			as in the complexity effective processors paper; issue_width==dispatch_width -->
 			<param name="commit_width" value="4"/>
 			<!-- commit_width determines the number of ports of register files -->
 			<param name="fp_issue_width" value="2"/>
 			<param name="prediction_width" value="1"/> 
 			<!-- number of branch instructions that can be predicted simultaneously -->
 			<!-- Current version of McPAT does not distinguish int and floating point pipelines.
 			These parameters are reserved for future use. -->
 			<param name="pipelines_per_core" value="1,1"/>
 			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
 			<param name="pipeline_depth" value="31,31"/>
 			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
 			<!-- issue and exe unit-->
 			<param name="ALU_per_core" value="6"/>
 			<!-- contains an adder, a shifter, and a logical unit -->
 			<param name="MUL_per_core" value="1"/>
 			<!-- For MUL and Div -->
 			<param name="FPU_per_core" value="2"/>		
 			<!-- buffer between IF and ID stage -->
 			<param name="instruction_buffer_size" value="32"/>
 			<!-- buffer between ID and sche/exe stage -->
 			<param name="decoded_stream_buffer_size" value="16"/>
 			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
 			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
 			<param name="instruction_window_size" value="64"/>
 			<param name="fp_instruction_window_size" value="64"/>
 			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
 			<param name="ROB_size" value="128"/>
 			<!-- each in-flight instruction has an entry in ROB -->
 			<!-- registers -->
 			<param name="archi_Regs_IRF_size" value="16"/><!-- X86-64 has 16GPR -->			
 			<param name="archi_Regs_FRF_size" value="32"/><!-- MMX + XMM -->
 			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
 			renaming logic is for both integer and floating point insts.  -->
 			<param name="phy_Regs_IRF_size" value="256"/>
 			<param name="phy_Regs_FRF_size" value="256"/>
 			<!-- rename logic -->
 			<param name="rename_scheme" value="0"/>
 			<!-- can be RAM based(0) or CAM based(1) rename scheme 
 			RAM-based scheme will have free list, status table;
 			CAM-based scheme have the valid bit in the data field of the CAM 
 			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
 			Detailed RAT Implementation see TR -->
 			<param name="register_windows_size" value="0"/>
 			<!-- how many windows in the windowed register file, sun processors;
 			no register windowing is used when this number is 0 -->
 			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
 			they will always try to execute out of order though. -->
 			<param name="LSU_order" value="inorder"/>
 			<param name="store_buffer_size" value="96"/>
 			<!-- By default, in-order cores do not have load buffers -->
 			<param name="load_buffer_size" value="48"/>	
 			<!-- number of ports refer to sustainable concurrent memory accesses --> 
 			<param name="memory_ports" value="2"/>	
 			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
 			as well as the ports of Dcache which is connected to LSU -->
 			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
 			<param name="RAS_size" value="64"/>						
 			<!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check -->
 			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
 			<stat name="total_instructions" value="400000"/>
 			<stat name="int_instructions" value="200000"/>
 			<stat name="fp_instructions" value="100000"/>
 			<stat name="branch_instructions" value="100000"/>
 			<stat name="branch_mispredictions" value="0"/>
 			<stat name="load_instructions" value="0"/>
 			<stat name="store_instructions" value="50000"/>
 			<stat name="committed_instructions" value="400000"/>
 			<stat name="committed_int_instructions" value="200000"/>
 			<stat name="committed_fp_instructions" value="100000"/>
 			<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
 			<!-- the following cycle stats are used for heterogeneous cores only;
 				please ignore them if the cores are homogeneous -->
 			<stat name="total_cycles" value="100000"/>
 		    <stat name="idle_cycles" value="0"/>
 		    <stat name="busy_cycles"  value="100000"/>
 			<!-- instruction buffer stats -->
 			<!-- ROB stats, both RS and Phy based OoOs have ROB;
 			the performance simulator should capture the difference on accesses,
 			otherwise, McPAT has to guess based on number of committed instructions. -->
 			<stat name="ROB_reads" value="400000"/>
 			<stat name="ROB_writes" value="400000"/>
 			<!-- RAT accesses -->
 			<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
 			<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
 			<stat name="fp_rename_reads" value="200000"/>
 			<stat name="fp_rename_writes" value="100000"/>
 			<!-- decode and rename stage use this, should be total ic - nop -->
 			<!-- Inst window stats -->
 			<stat name="inst_window_reads" value="400000"/>
 			<stat name="inst_window_writes" value="400000"/>
 			<stat name="inst_window_wakeup_accesses" value="800000"/>
 			<stat name="fp_inst_window_reads" value="200000"/>
 			<stat name="fp_inst_window_writes" value="200000"/>
 			<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
 			<!--  RF accesses -->
 			<stat name="int_regfile_reads" value="600000"/>
 			<stat name="float_regfile_reads" value="100000"/>
 			<stat name="int_regfile_writes" value="300000"/>
 			<stat name="float_regfile_writes" value="50000"/>
 			<!-- accesses to the working reg -->
 			<stat name="function_calls" value="5"/>
 			<stat name="context_switches" value="260343"/>
 			<!-- Number of window switches (number of function calls and returns)-->
 			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
 			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
 			<stat name="ialu_accesses" value="300000"/>			
 			<stat name="fpu_accesses" value="100000"/>
 			<stat name="mul_accesses" value="200000"/>
 			<stat name="cdb_alu_accesses" value="300000"/>
 			<stat name="cdb_mul_accesses" value="200000"/>
 			<stat name="cdb_fpu_accesses" value="100000"/>
 			<!-- multiple cycle accesses should be counted multiple times, 
 			otherwise, McPAT can use internal counter for different floating point instructions 
 			to get final accesses. But that needs detailed info for floating point inst mix -->
 			<!--  currently the performance simulator should
 			make sure all the numbers are final numbers,
 			including the explicit read/write accesses,
 			and the implicit accesses such as replacements etc.
 			Future versions of McPAT may be able to infer the implicit accesses
 			based on param and stats of last level cache.
 			The same rule applies to all cache access stats too!  -->
 			<!-- following is AF for max power computation. 
 				Do not change them, unless you understand them-->
 			<stat name="IFU_duty_cycle" value="1"/>			
 			<stat name="LSU_duty_cycle" value="0.5"/>
 			<stat name="MemManU_I_duty_cycle" value="1"/>
 			<stat name="MemManU_D_duty_cycle" value="0.5"/>
 			<stat name="ALU_duty_cycle" value="1"/>
 			<stat name="MUL_duty_cycle" value="0.3"/>
 			<stat name="FPU_duty_cycle" value="0.3"/>
 			<stat name="ALU_cdb_duty_cycle" value="1"/>
 			<stat name="MUL_cdb_duty_cycle" value="0.3"/>
 			<stat name="FPU_cdb_duty_cycle" value="0.3"/>
 			<param name="number_of_BPT" value="2"/>
 			<component id="system.core0.predictor" name="PBT">
 				<!-- branch predictor; tournament predictor see Alpha implementation -->
 				<param name="local_predictor_size" value="10,3"/>
 				<param name="local_predictor_entries" value="1024"/>
 				<param name="global_predictor_entries" value="4096"/>
 				<param name="global_predictor_bits" value="2"/>
 				<param name="chooser_predictor_entries" value="4096"/>
 				<param name="chooser_predictor_bits" value="2"/>
 				<!-- These parameters can be combined like below in next version
 				<param name="load_predictor" value="10,3,1024"/>
 				<param name="global_predictor" value="4096,2"/>
 				<param name="predictor_chooser" value="4096,2"/>
 				-->
 			</component>
 			<component id="system.core0.itlb" name="itlb">
 				<param name="number_entries" value="128"/>
 				<stat name="total_accesses" value="200000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
 				which is actually a replacement -->
 			</component>
 			<component id="system.core0.icache" name="icache">
 				<!-- there are no write requests to the icache, although writes happen to it after a miss,
 				which is actually a replacement -->
 				<param name="icache_config" value="131072,32,8,1,8,3,32,0"/>
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
 				<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
 				<param name="buffer_sizes" value="16, 16, 16,0"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
 				<stat name="read_accesses" value="200000"/>
 				<stat name="read_misses" value="0"/>
 				<stat name="conflicts" value="0"/>				
 			</component>
 			<component id="system.core0.dtlb" name="dtlb">
 				<param name="number_entries" value="128"/><!--dual threads-->
 				<stat name="total_accesses" value="400000"/>
 				<stat name="total_misses" value="4"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<component id="system.core0.dcache" name="dcache">
 			        <!-- all the buffer related are optional -->
 				<param name="dcache_config" value="16384,16,4,1, 3,3, 16,1 "/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			</component>
 			<param name="number_of_BTB" value="2"/>
 			<component id="system.core0.BTB" name="BTB">
 			        <!-- all the buffer related are optional -->
 				<param name="BTB_config" value="5120,4,2,1, 1,3"/> <!--should be 4096 + 1024 -->
 				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 				<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
 				<stat name="write_accesses" value="0"/>
 			</component>
 	</component>
 		<component id="system.L1Directory0" name="L1Directory0">
 				<param name="Directory_type" value="0"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="4096,2,0,1,100,100, 8"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3400"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="800000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="20"/>	
 		</component>
 		<component id="system.L2Directory0" name="L2Directory0">
 				<param name="Directory_type" value="1"/>
 			    <!--0 cam based shadowed tag. 1 directory cache -->	
 				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
 				<!-- all the buffer related are optional -->
 			    <param name="clockrate" value="3400"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw search ports -->
 				<param name="device_type" value="0"/>
 				<!-- although there are multiple access types,
 				the performance simulator needs to cast them into reads or writes,
 				e.g. the invalidates can be considered as writes -->
 				<stat name="read_accesses" value="58824"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="100"/>	
 		</component>
 		<component id="system.L20" name="L20">
 			<!-- all the buffer related are optional -->
 				<param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/> 
 				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<param name="clockrate" value="3400"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<stat name="read_accesses" value="200000"/>
 				<stat name="write_accesses" value="27276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 			    <stat name="duty_cycle" value="1.0"/>	
 		</component>
 <!--**********************************************************************-->
 <component id="system.L30" name="L30">
 				<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
 				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
 				<param name="clockrate" value="850"/>
 				<param name="ports" value="1,1,1"/>
 				<!-- number of r, w, and rw ports -->
 				<param name="device_type" value="0"/>
 				<param name="buffer_sizes" value="16, 16, 16, 16"/>
 				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
 				<stat name="read_accesses" value="11824"/>
 				<stat name="write_accesses" value="11276"/>
 				<stat name="read_misses" value="1632"/>
 				<stat name="write_misses" value="183"/>
 				<stat name="conflicts" value="0"/>	
 				<stat name="duty_cycle" value="1.0"/>	
 		</component>
 <!--**********************************************************************-->
 		<component id="system.NoC0" name="noc0">
 			<param name="clockrate" value="3400"/>
 			<param name="type" value="0"/>
 			<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
 				at each time only one node can send req -->
 			<param name="horizontal_nodes" value="1"/>
 			<param name="vertical_nodes" value="1"/>
 			<param name="has_global_link" value="0"/>
 			<!-- 1 has global link, 0 does not have global link -->
 			<param name="link_throughput" value="1"/><!--w.r.t clock -->
 			<param name="link_latency" value="1"/><!--w.r.t clock -->
 			<!-- througput >= latency -->
 			<!-- Router architecture -->
 			<param name="input_ports" value="1"/>
 			<param name="output_ports" value="1"/>
 			<!-- For bus the I/O ports should be 1 -->
 			<param name="flit_bits" value="256"/>
 			<param name="chip_coverage" value="1"/>
 			<!-- When multiple NOC present, one NOC will cover part of the whole chip. 
 				chip_coverage <=1 -->
 			<param name="link_routing_over_percentage" value="0.5"/>
 			<!-- Links can route over other components or occupy whole area.
 				by default, 50% of the NoC global links routes over other 
 				components -->
 			<stat name="total_accesses" value="100000"/>
 			<!-- This is the number of total accesses within the whole network not for each router -->
 			<stat name="duty_cycle" value="1"/>
 		</component>		
 <!--**********************************************************************-->
 		<component id="system.mem" name="mem">
 			<!-- Main memory property -->
 			<param name="mem_tech_node" value="32"/>
 			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
 			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
 			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
 			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
 			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
 			<!-- above numbers can be easily found from Wikipedia -->
 			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
 			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
 			Current McPAT assumes single DIMMs are used.--> 		
 			<param name="number_ranks" value="2"/>
 			<param name="num_banks_of_DRAM_chip" value="8"/>			
 			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
 			<param name="output_width_of_DRAM_chip" value="8"/>
 			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
 			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
 			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
 			<param name="burstlength_of_DRAM_chip" value="8"/>
 			<stat name="memory_accesses" value="1052"/>
 			<stat name="memory_reads" value="1052"/>
 			<stat name="memory_writes" value="1052"/>									
 		</component>
 		<component id="system.mc" name="mc">
 			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
 			<!-- current version of McPAT uses published values for base parameters of memory controller;
 			improvements on MC will be added in later versions. -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
 			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
 			<param name="block_size" value="64"/><!--B-->
 			<param name="number_mcs" value="0"/>
 			<!-- current McPAT only supports homogeneous memory controllers -->
 			<param name="memory_channels_per_mc" value="1"/>
 			<param name="number_ranks" value="2"/>
 			<param name="withPHY" value="0"/>
 			<!-- # of ranks of each channel-->
 			<param name="req_window_size_per_channel" value="32"/>
 			<param name="IO_buffer_size_per_channel" value="32"/>
 			<param name="databus_width" value="128"/>
 			<param name="addressbus_width" value="51"/>
 			<!-- McPAT will add the control bus width to the addressbus width automatically -->
 			<stat name="memory_accesses" value="33333"/>
 			<stat name="memory_reads" value="16667"/>
 			<stat name="memory_writes" value="16667"/>
 			<!-- McPAT does not track individual mc, instead, it takes the total accesses and calculates
 			the average power per MC or per channel. This is sufficient for most applications.
 			Further trackdown can be easily added in later versions. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.niu" name="niu">
 			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
 			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
 				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual nic, instead, it takes the total accesses and calculates
 			the average power per nic or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.pcie" name="pcie">
 			<!-- On chip PCIe controller, including Phy-->
 			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
 				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
 			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
 			<param name="withPHY" value="1"/>
 			<param name="clockrate" value="350"/>
 			<param name="number_units" value="0"/>
 			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculates
 			the average power per pcie controller or per channel. This is sufficient for most applications. -->
 		</component>
 <!--**********************************************************************-->
 		<component id="system.flashc" name="flashc">
 		    <param name="number_flashcs" value="0"/>
 			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
            <param name="withPHY" value="1"/>
 			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
 			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
 			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
 			<!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculates
 			the average power per fc or per channel. This is sufficient for most applications -->
 		</component>
 <!--**********************************************************************-->
 		</component>
 </component>
--- a/ext/mcpat/arch_const.h
+++ b/ext/mcpat/arch_const.h
@ -0,0 +1,276 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef ARCH_CONST_H_
 #define ARCH_CONST_H_
 /* Geometry triple describing one array structure. */
 typedef struct {
     unsigned int capacity;
     unsigned int assoc;      // original note: "fully"; this header marks assoc 0 as fully associative
     unsigned int blocksize;
 } array_inputs;
 //Do Not change, unless you want to bypass the XML interface and do not care about the default values.
 //Global parameters
 // Chip-wide default values (used when the XML interface is bypassed).
 const int           number_of_cores     = 8;
 const int           number_of_L2s       = 1;
 const int           number_of_L3s       = 1;
 const int           number_of_NoCs      = 1;
 const double        archi_F_sz_nm       = 90.0;
 const unsigned int  dev_type            = 0;
 const double        CLOCKRATE           = 1.2*1e9;
 const double        AF                  = 0.5;
 //const bool        inorder             = true;
 const bool          embedded            = false; //NEW
 const bool          homogeneous_cores   = true;
 // Was `const bool`, which silently turned 360 into `true` (1).
 // Declared as int so the default keeps its numeric value.
 const int           temperature         = 360;
 const int           number_cache_levels = 3;
 const int           L1_property         = 0; //private 0; coherent 1, shared 2.
 const int           L2_property         = 2;
 const bool          homogeneous_L2s     = true;
 // Was `const bool`, which could never hold the value 2 from the
 // 0/1/2 encoding noted on L1_property; int matches L1_property/L2_property.
 const int           L3_property         = 2;
 const bool          homogeneous_L3s     = true;
 const double        Max_area_deviation  = 50;
 const double        Max_dynamic_deviation = 50; //New
 const int           opt_dynamic_power   = 1;
 const int           opt_lakage_power    = 0;
 const int           opt_area            = 0;
 const int           interconnect_projection_type = 0;
 //******************************Core Parameters
 // Instruction/data format defaults. Only reg_length depends on the
 // `inorder` preprocessor test; the other three values are identical in
 // both configurations.
 // NOTE(review): `inorder` is evaluated by the preprocessor. The only
 // mention of that name in this header is a commented-out `const bool`,
 // so unless a macro is supplied from outside, the #else value is used.
 const int opcode_length      = 8;  // Niagara (a value of 16 was noted for the non-inorder case)
 #if (inorder)
 const int reg_length         = 5;  // Niagara
 #else
 const int reg_length         = 7;  // Niagara
 #endif
 const int instruction_length = 32; // Niagara
 const int data_width         = 64;
 //Caches
 // itlb (associativity 0 denotes fully associative)
 const int itlbsize             = 512;
 const int itlbassoc            = 0;
 const int itlbblocksize        = 8;
 // icache
 const int icachesize           = 32768;
 const int icacheassoc          = 4;
 const int icacheblocksize      = 32;
 // dtlb (associativity 0 denotes fully associative)
 const int dtlbsize             = 512;
 const int dtlbassoc            = 0;
 const int dtlbblocksize        = 8;
 // dcache
 const int dcachesize           = 32768;
 const int dcacheassoc          = 4;
 const int dcacheblocksize      = 32;
 const int dcache_write_buffers = 8;
 // cache controllers
 // IB
 const int numIBEntries         = 64;
 const int IBsize               = 64; // 2*4*instruction_length/8*2;
 const int IBassoc              = 0;  // in Niagara it is still fully associative
 const int IBblocksize          = 4;
 // IFB and MIL should have the same parameters (CAM)
 const int IFBsize              = 128;
 const int IFBassoc             = 0;  // in Niagara it is still fully associative
 const int IFBblocksize         = 4;
 const int icache_write_buffers = 8;
 // register file (RAM)
 const int regfilesize          = 5760;
 const int regfileassoc         = 1;
 const int regfileblocksize     = 18;
 // regwin (RAM)
 const int regwinsize           = 256;
 const int regwinassoc          = 1;
 const int regwinblocksize      = 8;
 // store buffer, lsq
 const int lsqsize              = 512;
 const int lsqassoc             = 0;
 const int lsqblocksize         = 8;
 // data fill queue (RAM)
 const int dfqsize              = 1024;
 const int dfqassoc             = 1;
 const int dfqblocksize         = 16;
 // outside the cores
 // L2 cache bank
 const int l2cachesize      = 262144;
 const int l2cacheassoc     = 16;
 const int l2cacheblocksize = 64;
 // L2 directory
 const int l2dirsize        = 1024;
 const int l2dirassoc       = 0;
 const int l2dirblocksize   = 2;
 // crossbar
 // PCX
 const int PCX_NUMBER_INPUT_PORTS_CROSSBAR      = 8;
 const int PCX_NUMBER_OUTPUT_PORTS_CROSSBAR     = 9;
 const int PCX_NUMBER_SIGNALS_PER_PORT_CROSSBAR = 144;
 // PCX buffer (RAM)
 const int pcx_buffersize      = 1024;
 const int pcx_bufferassoc     = 1;
 const int pcx_bufferblocksize = 32;
 const int pcx_numbuffer       = 5;
 // PCX arbiter
 const int pcx_arbsize         = 128;
 const int pcx_arbassoc        = 1;
 const int pcx_arbblocksize    = 2;
 const int pcx_numarb          = 5;
 // CPX
 const int CPX_NUMBER_INPUT_PORTS_CROSSBAR      = 5;
 const int CPX_NUMBER_OUTPUT_PORTS_CROSSBAR     = 8;
 const int CPX_NUMBER_SIGNALS_PER_PORT_CROSSBAR = 150;
 // CPX buffer (RAM)
 const int cpx_buffersize      = 1024;
 const int cpx_bufferassoc     = 1;
 const int cpx_bufferblocksize = 32;
 const int cpx_numbuffer       = 8;
 // CPX arbiter
 const int cpx_arbsize         = 128;
 const int cpx_arbassoc        = 1;
 const int cpx_arbblocksize    = 2;
 const int cpx_numarb          = 8;
 // Default sizes, widths and predictor settings for a core.
 const int numPhysFloatRegs      = 256;
 const int numPhysIntRegs        = 32;
 const int numROBEntries         = 192;
 const int umRobs                = 1;    // NOTE(review): name looks like a typo of "numRobs"; kept as-is for compatibility
 const int BTBEntries            = 4096;
 const int BTBTagSize            = 16;
 const int LFSTSize              = 1024;
 const int LQEntries             = 32;
 const int RASSize               = 16;
 const int SQEntries             = 32;
 const int SSITSize              = 1024;
 const int activity              = 0;
 const int backComSize           = 5;
 const int cachePorts            = 200;
 const int choiceCtrBits         = 2;
 const int choicePredictorSize   = 8192;
 const int commitWidth           = 8;
 const int decodeWidth           = 8;
 const int dispatchWidth         = 8;
 const int fetchWidth            = 8;
 const int issueWidth            = 1;
 const int renameWidth           = 8;
 // open question left by the original author: what is forwardComSize=5?
 const int globalCtrBits         = 2;
 const int globalHistoryBits     = 13;
 const int globalPredictorSize   = 8192;
 const int localCtrBits          = 2;
 const int localHistoryBits      = 11;
 const int localHistoryTableSize = 2048;
 const int localPredictorSize    = 2048;
 // Output-driver widths, each written as a multiplier times 0.09.
 // The trailing notes are the original author's LSCALE-based expressions.
 const double Woutdrvnandn = 30 * 0.09;    // (24.0 * LSCALE)
 const double Woutdrvnandp = 12.5 * 0.09;  // (10.0 * LSCALE)
 const double Woutdrvnorn  = 7.5 * 0.09;   // (6.0 * LSCALE)
 const double Woutdrvnorp  = 50 * 0.09;    // (40.0 * LSCALE)
 const double Woutdrivern  = 60 * 0.09;    // (48.0 * LSCALE)
 const double Woutdriverp  = 100 * 0.09;   // (80.0 * LSCALE)
 /*
 smtCommitPolicy=RoundRobin
 smtFetchPolicy=SingleThread
 smtIQPolicy=Partitioned
 smtIQThreshold=100
 smtLSQPolicy=Partitioned
 smtLSQThreshold=100
 smtNumFetchingThreads=1
 smtROBPolicy=Partitioned
 smtROBThreshold=100
 squashWidth=8
 */
 /*
 prefetch_access=false
 prefetch_cache_check_push=true
 prefetch_data_accesses_only=false
 prefetch_degree=1
 prefetch_latency=10000
 prefetch_miss=false
 prefetch_past_page=false
 prefetch_policy=none
 prefetch_serial_squash=false
 prefetch_use_cpu_id=true
 prefetcher_size=100
 prioritizeRequests=false
 repl=Null
 split=false
 split_size=0
 subblock_size=0
 tgts_per_mshr=20
 trace_addr=0
 two_queue=false
 cpu_side=system.cpu0.dcache_port
 mem_side=system.tol2bus.port[2]
 */
 //[system.cpu0.dtb]
 //type=AlphaDT
 #endif /* ARCH_CONST_H_ */
--- a/ext/mcpat/array.cc
+++ b/ext/mcpat/array.cc
@ -0,0 +1,302 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #define  GLOBALVAR
 #include <cassert>
 #include <cmath>
 #include <iostream>
 #include "area.h"
 #include "array.h"
 #include "decoder.h"
 #include "globalvar.h"
 #include "parameter.h"
 using namespace std;
 /**
  * Build an array model from a CACTI input description.
  *
  * The input parameters are copied into l_ip, so the caller's structure is
  * not modified. After the copy, cache_sz is raised to at least 64, the
  * parameters are validated/completed, and optimize_array() is run.
  *
  * @param configure_interface  CACTI input parameters to copy; must not be null
  *                             (it is dereferenced unconditionally).
  * @param _name        Name used in warning messages.
  * @param device_ty_   Device category passed to longer_channel_device_reduction().
  * @param opt_local_   Enables the local timing optimization in optimize_array().
  * @param core_ty_     Core type passed to longer_channel_device_reduction().
  * @param _is_default  Stored in is_default; not used by the code in this file.
  */
 ArrayST::ArrayST(const InputParameter *configure_interface,
                  string _name,
                  enum Device_ty device_ty_,
                  bool opt_local_,
                  enum Core_type core_ty_,
                  bool _is_default)
     : l_ip(*configure_interface),
       name(_name),
       device_ty(device_ty_),
       opt_local(opt_local_),
       core_ty(core_ty_),
       is_default(_is_default)
 {
     // Arrays smaller than 64 are modelled as 64.
     if (l_ip.cache_sz < 64)
     {
         l_ip.cache_sz = 64;
     }
     // Not only performs the error checking but also fills in some missing parameters.
     l_ip.error_checking();
     optimize_array();
 }
 // Evaluate the array with the current input parameters (l_ip) through
 // cacti_interface() and store the outcome in local_result.
 void ArrayST::compute_base_power()
 {
     local_result = cacti_interface(&l_ip);
 }
 /**
  * Search for an array configuration that meets the throughput and latency
  * targets stored in l_ip, then scale the resulting area and power.
  *
  * Steps, as implemented below:
  *  1. Evaluate the array once with the supplied parameters and compare
  *     cycle_time against l_ip.throughput and access_time against
  *     l_ip.latency (tolerance 1e-10).
  *  2. When both opt_for_clk and opt_local are set and a target was missed,
  *     switch l_ip to fixed weights/deviations and re-evaluate repeatedly,
  *     lowering l_ip.cycle_time_dev by 10 per iteration while it is > 10.
  *     Results that meet both targets (or, when l_ip.assoc == 0, whose data
  *     array area efficiency is below the threshold) are kept as candidates.
  *  3. Among the candidates, the one with the lowest read dynamic value is
  *     copied into local_result.
  *  4. Area is multiplied by the combined layout overhead, dynamic values by
  *     g_tp.sckt_co_eff and leakage by l_ip.nbanks; the long-channel leakage
  *     is derived from the scaled leakage and each power record is multiplied
  *     by pppm_t. The same is applied to the data array and, when l_ip
  *     describes a cache that is not pure CAM, pure RAM or fully associative,
  *     to the tag array.
  *
  * A warning is printed to cout when l_ip.assoc > 0 and a target is still
  * unmet after the search.
  */
 void ArrayST::optimize_array()
 {
        list<uca_org_t > candidate_solutions(0);
        list<uca_org_t >::iterator candidate_iter, min_dynamic_energy_iter;
        uca_org_t * temp_res = 0;
        local_result.valid=false;
        double 	throughput=l_ip.throughput, latency=l_ip.latency;
        double  area_efficiency_threshold = 20.0;
        bool 	throughput_overflow=true, latency_overflow=true;
        // Baseline evaluation with the parameters exactly as supplied.
        compute_base_power();
        if ((local_result.cycle_time - throughput) <= 1e-10 )
                throughput_overflow=false;
        if ((local_result.access_time - latency)<= 1e-10)
                latency_overflow=false;
        // Everything up to the matching closing brace (just before the
        // overhead scaling) runs only when clock optimization is enabled both
        // globally (opt_for_clk) and for this array (opt_local).
        if (opt_for_clk && opt_local)
        {
                if (throughput_overflow || latency_overflow)
                {
                        l_ip.ed=0;
                        l_ip.delay_wt                = 100;//Fixed number, make sure timing can be satisfied.
                        l_ip.cycle_time_wt           = 1000;
                        l_ip.area_wt                 = 10;//Fixed number, This is used to exhaustive search for individual components.
                        l_ip.dynamic_power_wt        = 10;//Fixed number, This is used to exhaustive search for individual components.
                        l_ip.leakage_power_wt        = 10;
                        l_ip.delay_dev               = 1000000;//Fixed number, make sure timing can be satisfied.
                        l_ip.cycle_time_dev          = 100;
                        l_ip.area_dev                = 1000000;//Fixed number, This is used to exhaustive search for individual components.
                        l_ip.dynamic_power_dev       = 1000000;//Fixed number, This is used to exhaustive search for individual components.
                        l_ip.leakage_power_dev       = 1000000;
                        throughput_overflow=true; //Reset overflow flag before start optimization iterations
                        latency_overflow=true;
                        temp_res = &local_result; //Clean up the result for optimized for ED^2P
                        temp_res->cleanup();
                }
                // Re-evaluate with a progressively smaller cycle_time_dev until
                // both targets are met or the deviation reaches 10.
                while ((throughput_overflow || latency_overflow)&&l_ip.cycle_time_dev > 10)// && l_ip.delay_dev > 10
                {
                        compute_base_power();
                        l_ip.cycle_time_dev-=10;//This is the time_dev to be used for next iteration
                        //		from best area to worst area -->worst timing to best timing
                        if ((((local_result.cycle_time - throughput) <= 1e-10 ) && (local_result.access_time - latency)<= 1e-10)||
                                        (local_result.data_array2->area_efficiency < area_efficiency_threshold && l_ip.assoc == 0))
                        {  //if no satisfiable solution is found,the most aggressive one is left
                                candidate_solutions.push_back(local_result);
                                //output_data_csv(candidate_solutions.back());
                                if (((local_result.cycle_time - throughput) <= 1e-10) && ((local_result.access_time - latency)<= 1e-10))
                                        //ensure stop opt not because of cam
                                {
                                        throughput_overflow=false;
                                        latency_overflow=false;
                                }
                        }
                        else
                        {
                                //TODO: whether checking the partial satisfied results too, or just change the mark???
                                if ((local_result.cycle_time - throughput) <= 1e-10)
                                                                                throughput_overflow=false;
                                if ((local_result.access_time - latency)<= 1e-10)
                                                                                latency_overflow=false;
                                if (l_ip.cycle_time_dev > 10)
                                {   //if not >10 local_result is the last result, it cannot be cleaned up
                                        temp_res = &local_result; //Only solutions not saved in the list need to be cleaned up
                                        temp_res->cleanup();
                                }
                        }
 //			l_ip.cycle_time_dev-=10;
 //			l_ip.delay_dev-=10;
                }
        // NOTE: despite the indentation, the warning block below is still
        // inside the `if (opt_for_clk && opt_local)` block.
        if (l_ip.assoc > 0)
        {
                //For array structures except CAM and FA, Give warning but still provide a result with best timing found
                if (throughput_overflow==true)
                        cout<< "Warning: " << name<<" array structure cannot satisfy throughput constraint." << endl;
                if (latency_overflow==true)
                        cout<< "Warning: " << name<<" array structure cannot satisfy latency constraint." << endl;
        }
 //	else
 //	{
 //		/*According to "Content-Addressable Memory (CAM) Circuits and
 //				Architectures": A Tutorial and Survey
 //				by Kostas Pagiamtzis et al.
 //				CAM structures can be heavily pipelined and use look-ahead techniques,
 //				therefore timing can be relaxed. But McPAT does not model the advanced
 //				techniques. If continue optimizing, the area efficiency will be too low
 //		*/
 //		//For CAM and FA, stop opt if area efficiency is too low
 //		if (throughput_overflow==true)
 //			cout<< "Warning: " <<" McPAT stopped optimization on throughput for "<< name
 //				<<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl;
 //		if (latency_overflow==true)
 //			cout<< "Warning: " <<" McPAT stopped optimization on latency for "<< name
 //				<<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl;
 //	}
                //double min_dynamic_energy, min_dynamic_power, min_leakage_power, min_cycle_time;
                // Pick the candidate with the smallest read dynamic value.
                // NOTE(review): valid is set before local_result is overwritten
                // by the chosen candidate, so the final flag comes from that
                // candidate -- confirm this is intended.
                // NOTE(review): when a later candidate beats an earlier best,
                // the earlier one is never passed to cleanup(); likewise a last
                // loop result that was not pushed to the list is overwritten
                // without cleanup(). Possible leak if cleanup() releases owned
                // memory -- verify against uca_org_t.
                double min_dynamic_energy=BIGNUM;
                if (candidate_solutions.empty()==false)
                {
                        local_result.valid=true;
                        for (candidate_iter = candidate_solutions.begin(); candidate_iter != candidate_solutions.end(); ++candidate_iter)
                        {
                                if (min_dynamic_energy > (candidate_iter)->power.readOp.dynamic)
                                {
                                        min_dynamic_energy = (candidate_iter)->power.readOp.dynamic;
                                        min_dynamic_energy_iter = candidate_iter;
                                        local_result = *(min_dynamic_energy_iter);
                                        //TODO: since results are reordered results and l_ip may miss match. Therefore, the final output spread sheets may show the miss match.
                                }
                                else
                                {
                                        candidate_iter->cleanup() ;
                                }
                        }
                }
        candidate_solutions.clear();
        }
        // Post-processing applied regardless of whether the search above ran.
        double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
        double macro_layout_overhead   = g_tp.macro_layout_overhead;
        double chip_PR_overhead        = g_tp.chip_layout_overhead;
        double total_overhead          = macro_layout_overhead*chip_PR_overhead;
        local_result.area *= total_overhead;
        //maintain constant power density
        double pppm_t[4]    = {total_overhead,1,1,total_overhead};
        double sckRation = g_tp.sckt_co_eff;
        local_result.power.readOp.dynamic *= sckRation;
        local_result.power.writeOp.dynamic *= sckRation;
        local_result.power.searchOp.dynamic *= sckRation;
        // Leakage must be scaled by the bank count before the long-channel
        // value is derived from it.
        local_result.power.readOp.leakage *= l_ip.nbanks;
        local_result.power.readOp.longer_channel_leakage =
                local_result.power.readOp.leakage*long_channel_device_reduction;
        local_result.power = local_result.power* pppm_t;
        // Same scaling for the data array record.
        local_result.data_array2->power.readOp.dynamic *= sckRation;
        local_result.data_array2->power.writeOp.dynamic *= sckRation;
        local_result.data_array2->power.searchOp.dynamic *= sckRation;
        local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
        local_result.data_array2->power.readOp.longer_channel_leakage =
                local_result.data_array2->power.readOp.leakage*long_channel_device_reduction;
        local_result.data_array2->power = local_result.data_array2->power* pppm_t;
        // The tag array record is only touched for caches that are not pure
        // CAM, pure RAM or fully associative.
        if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache)
        {
                local_result.tag_array2->power.readOp.dynamic *= sckRation;
                local_result.tag_array2->power.writeOp.dynamic *= sckRation;
                local_result.tag_array2->power.searchOp.dynamic *= sckRation;
                local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
                local_result.tag_array2->power.readOp.longer_channel_leakage =
                        local_result.tag_array2->power.readOp.leakage*long_channel_device_reduction;
                local_result.tag_array2->power = local_result.tag_array2->power* pppm_t;
        }
 }
 void ArrayST::leakage_feedback(double temperature)
 {
  // Update the temperature. l_ip is already set and error-checked in the creator function.
  l_ip.temp = (unsigned int)round(temperature/10.0)*10;
  // This corresponds to cacti_interface() in the initialization process. Leakage power is updated here.
  reconfigure(&l_ip,&local_result);
  // Scale the power values. This is part of ArrayST::optimize_array().
  double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
  double macro_layout_overhead   = g_tp.macro_layout_overhead;
  double chip_PR_overhead        = g_tp.chip_layout_overhead;
  double total_overhead          = macro_layout_overhead*chip_PR_overhead;
  double pppm_t[4]    = {total_overhead,1,1,total_overhead};
  double sckRation = g_tp.sckt_co_eff;
  local_result.power.readOp.dynamic *= sckRation;
  local_result.power.writeOp.dynamic *= sckRation;
  local_result.power.searchOp.dynamic *= sckRation;
  local_result.power.readOp.leakage *= l_ip.nbanks;
  local_result.power.readOp.longer_channel_leakage = local_result.power.readOp.leakage*long_channel_device_reduction;
  local_result.power = local_result.power* pppm_t;
  local_result.data_array2->power.readOp.dynamic *= sckRation;
  local_result.data_array2->power.writeOp.dynamic *= sckRation;
  local_result.data_array2->power.searchOp.dynamic *= sckRation;
  local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
  local_result.data_array2->power.readOp.longer_channel_leakage = local_result.data_array2->power.readOp.leakage*long_channel_device_reduction;
  local_result.data_array2->power = local_result.data_array2->power* pppm_t;
  if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache)
  {
    local_result.tag_array2->power.readOp.dynamic *= sckRation;
    local_result.tag_array2->power.writeOp.dynamic *= sckRation;
    local_result.tag_array2->power.searchOp.dynamic *= sckRation;
    local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
    local_result.tag_array2->power.readOp.longer_channel_leakage = local_result.tag_array2->power.readOp.leakage*long_channel_device_reduction;
    local_result.tag_array2->power = local_result.tag_array2->power* pppm_t;
  }
 }
 // Hand the stored result back to its own cleanup() routine on destruction.
 ArrayST::~ArrayST()
 {
     local_result.cleanup();
 }
--- a/ext/mcpat/array.h
+++ b/ext/mcpat/array.h
@ -0,0 +1,101 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef ARRAY_H_
 #define ARRAY_H_
 #include <iostream>
 #include <string>
 #include "basic_components.h"
 #include "cacti_interface.h"
 #include "component.h"
 #include "const.h"
 #include "parameter.h"
 using namespace std;
 class ArrayST :public Component{
 public:
  ArrayST(){};
  ArrayST(const InputParameter *configure_interface, string _name, enum Device_ty device_ty_, bool opt_local_=true, enum Core_type core_ty_=Inorder,  bool _is_default=true);
  InputParameter l_ip;
  string         name;
  enum Device_ty device_ty;
  bool opt_local;
  enum Core_type core_ty;
  bool           is_default;
  uca_org_t      local_result;
  statsDef       tdp_stats;
  statsDef       rtp_stats;
  statsDef       stats_t;
  powerDef       power_t;
  virtual void optimize_array();
  virtual void compute_base_power();
  virtual ~ArrayST();
  void leakage_feedback(double temperature);
 };
 class InstCache :public Component{
 public:
  ArrayST* caches;
  ArrayST* missb;
  ArrayST* ifb;
  ArrayST* prefetchb;
  powerDef power_t;//temp value holder for both (max) power and runtime power
  InstCache(){caches=0;missb=0;ifb=0;prefetchb=0;};
  ~InstCache(){
          if (caches)    {//caches->local_result.cleanup();
                                          delete caches; caches=0;}
          if (missb)     {//missb->local_result.cleanup();
                                          delete missb; missb=0;}
          if (ifb)       {//ifb->local_result.cleanup();
                                          delete ifb; ifb=0;}
          if (prefetchb) {//prefetchb->local_result.cleanup();
                                          delete prefetchb; prefetchb=0;}
   };
 };
 class DataCache :public InstCache{
 public:
  ArrayST* wbb;
  DataCache(){wbb=0;};
  ~DataCache(){
          if (wbb) {//wbb->local_result.cleanup();
                                delete wbb; wbb=0;}
   };
 };
 #endif /* ARRAY_H_ */
--- a/ext/mcpat/basic_components.cc
+++ b/ext/mcpat/basic_components.cc
@ -0,0 +1,127 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include <cassert>
 #include <cmath>
 #include <iostream>
 #include "basic_components.h"
 /**
  * Leakage scaling factor obtained when a share of the devices uses longer
  * channels.
  *
  * The share p depends on the device category:
  *   - Core_device:   0.56 for OOO cores, 0.8 otherwise
  *   - Uncore_device: 0.82
  *   - LLC_device:    1.0
  * and the result is (1 - p) + p * g_tp.peri_global.long_channel_leakage_reduction.
  *
  * @param device_ty  Device category selecting the share.
  * @param core_ty    Core type; only consulted for Core_device.
  * @return The multiplicative leakage factor.
  *
  * An unrecognized device_ty prints "unknown device category" and terminates
  * the process with a non-zero status (the previous code exited with 0, which
  * reported success to the caller's shell on a fatal error).
  */
 double longer_channel_device_reduction(
                enum Device_ty device_ty,
                enum Core_type core_ty)
 {
        double longer_channel_device_percentage;
        switch (device_ty)
        {
        case Core_device:
                if (core_ty==OOO)
                {
                        longer_channel_device_percentage = 0.56;//0.54 Xeon Tulsa //0.58 Nehelam
                }
                else
                {
                        longer_channel_device_percentage = 0.8;//Niagara
                }
                break;
        case Uncore_device:
                longer_channel_device_percentage = 0.82;
                break;
        case LLC_device:
                longer_channel_device_percentage = 1.0;
                break;
        default:
                cout<<"unknown device category"<<endl;
                exit(1);
        }
        return (1- longer_channel_device_percentage)
                + longer_channel_device_percentage * g_tp.peri_global.long_channel_leakage_reduction;
 }
 // Field-wise sum of two statsComponents records.
 statsComponents operator+(const statsComponents & x, const statsComponents & y)
 {
        statsComponents sum(x);
        sum.access += y.access;
        sum.hit    += y.hit;
        sum.miss   += y.miss;
        return sum;
 }
 // Scale each field by its own factor: access by y[0], hit by y[1] and miss
 // by y[2]. y must therefore point to at least three doubles.
 statsComponents operator*(const statsComponents & x, double const * const y)
 {
        statsComponents scaled(x);
        scaled.access *= y[0];
        scaled.hit    *= y[1];
        scaled.miss   *= y[2];
        return scaled;
 }
 // Sum two statsDef records, one access category at a time.
 statsDef operator+(const statsDef & x, const statsDef & y)
 {
        statsDef total;
        total.readAc   = x.readAc + y.readAc;
        total.writeAc  = x.writeAc + y.writeAc;
        total.searchAc = x.searchAc + y.searchAc;
        return total;
 }
 // Scale all three access categories with the same factor array. y is handed
 // unchanged to the statsComponents operator*, which indexes y[0]..y[2].
 statsDef operator*(const statsDef & x, double const * const y)
 {
        statsDef scaled;
        scaled.readAc   = x.readAc * y;
        scaled.writeAc  = x.writeAc * y;
        scaled.searchAc = x.searchAc * y;
        return scaled;
 }
--- a/ext/mcpat/basic_components.h
+++ b/ext/mcpat/basic_components.h
@ -0,0 +1,265 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef BASIC_COMPONENTS_H_
 #define BASIC_COMPONENTS_H_
 #include <vector>
 #include "XML_Parse.h"
 #include "parameter.h"
 // Multiplicative overhead factor of 1.1; "cdb" presumably stands for the
 // common data bus -- TODO confirm against the users of this constant.
 const double cdb_overhead = 1.1;
 // Kinds of functional unit.
 enum FU_type {
        FPU,
        ALU,
        MUL
 };
 // Core category; longer_channel_device_reduction() distinguishes OOO from
 // everything else.
 enum Core_type {
        OOO,
        Inorder
 };
 // Register renaming implementation style.
 enum Renaming_type {
        RAMbased,
        CAMbased
 };
 // Instruction scheduler organisation.
 enum Scheduler_type {
        PhysicalRegFile,
        ReservationStation
 };
 // Shared cache / directory structure selector.
 enum cache_level {
        L2,
        L3,
        L1Directory,
        L2Directory
 };
 // Controller flavour.
 enum MemoryCtrl_type {
        MC,     // memory controller
        FLASHC  // flash controller
 };
 // Directory organisation.
 enum Dir_type {
        ST,     // shadowed tag
        DC,     // directory cache
        SBT,    // static bank tag
        NonDir
 };
 // Cache write policy.
 enum Cache_policy {
        Write_through,
        Write_back
 };
 // Device category; selects the long-channel device share used by
 // longer_channel_device_reduction().
 enum Device_ty {
        Core_device,
        Uncore_device,
        LLC_device
 };
 /**
  * Access/hit/miss counters for one access category. All counters are
  * doubles and start at zero.
  */
 class statsComponents
 {
  public:
    double access;
    double hit;
    double miss;
    statsComponents() : access(0), hit(0), miss(0) {}
    statsComponents(const statsComponents & obj)
        : access(obj.access), hit(obj.hit), miss(obj.miss) {}
    statsComponents & operator=(const statsComponents & rhs)
    {
      access = rhs.access;
      hit    = rhs.hit;
      miss   = rhs.miss;
      return *this;
    }
    // Zero all three counters.
    void reset()
    {
      access = 0;
      hit = 0;
      miss = 0;
    }
    // Field-wise sum.
    friend statsComponents operator+(const statsComponents & x, const statsComponents & y);
    // Per-field scaling by an array of factors.
    friend statsComponents operator*(const statsComponents & x, double const * const y);
 };
 /**
  * Counters for the three access categories (read, write, search), each a
  * statsComponents record.
  */
 class statsDef
 {
  public:
    statsComponents readAc;
    statsComponents writeAc;
    statsComponents searchAc;
    statsDef() : readAc(), writeAc(), searchAc() {}
    // Reset every category.
    void reset()
    {
      readAc.reset();
      writeAc.reset();
      searchAc.reset();
    }
    // Category-wise sum.
    friend statsDef operator+(const statsDef & x, const statsDef & y);
    // Scaling of every category by the same factor array.
    friend statsDef operator*(const statsDef & x, double const * const y);
 };
 // Returns the leakage scaling factor for the given device category and core
 // type (definition in basic_components.cc). Both arguments are optional and
 // default to Core_device / Inorder.
 double longer_channel_device_reduction(
                enum Device_ty device_ty=Core_device,
                enum Core_type core_ty=Inorder);
 /**
  * Per-core parameter record with public fields.
  *
  * The default constructor leaves every field uninitialized; the
  * (ParseXML*, int) constructor is defined elsewhere and presumably fills the
  * fields from the parsed XML -- TODO confirm. A disabled initializer list in
  * the original source suggested defaults such as clockRate(1e9) (1GHz),
  * arch_ireg_width(32), arch_freg_width(32), phy_ireg_width(128),
  * phy_freg_width(128), perThreadState(8), globalCheckpoint(32) and
  * instructionLength(32).
  */
 class CoreDynParam {
 public:
    CoreDynParam() {}
    CoreDynParam(ParseXML *XML_interface, int ithCore_);
    ~CoreDynParam() {}
    // Configuration flags and enumerated choices.
    bool opt_local;
    bool x86;
    bool Embedded;
    enum Core_type core_ty;
    enum Renaming_type rm_ty;
    enum Scheduler_type scheu_ty;
    double clockRate, executionTime;
    // Register widths and entry counts.
    int arch_ireg_width, arch_freg_width, phy_ireg_width, phy_freg_width;
    int num_IRF_entry, num_FRF_entry, num_ifreelist_entries, num_ffreelist_entries;
    // Pipeline widths.
    int fetchW, decodeW, issueW, peak_issueW, commitW, peak_commitW, predictionW, fp_issueW, fp_decodeW;
    int perThreadState, globalCheckpoint, instruction_length, pc_width, opcode_length, micro_opcode_length;
    int num_hthreads, pipeline_stages, fp_pipeline_stages, num_pipelines, num_fp_pipelines;
    // Functional unit counts (num_fpus is a double, unlike the others).
    int num_alus, num_muls;
    double num_fpus;
    int int_data_width, fp_data_width, v_address_width, p_address_width;
    double pipeline_duty_cycle, total_cycles, busy_cycles, idle_cycles;
    bool regWindowing, multithreaded;
    double pppm_lkg_multhread[4];
    // Duty cycles of the individual units.
    double IFU_duty_cycle, BR_duty_cycle, LSU_duty_cycle, MemManU_I_duty_cycle,
           MemManU_D_duty_cycle, ALU_duty_cycle, MUL_duty_cycle,
           FPU_duty_cycle, ALU_cdb_duty_cycle, MUL_cdb_duty_cycle,
           FPU_cdb_duty_cycle;
 };
 /**
  * Cache parameter record with public fields. The default constructor leaves
  * the scalar fields uninitialized; the (ParseXML*, int) constructor is
  * defined elsewhere.
  */
 class CacheDynParam {
 public:
    CacheDynParam() {}
    CacheDynParam(ParseXML *XML_interface, int ithCache_);
    ~CacheDynParam() {}
    string name;
    enum Dir_type dir_ty;
    double clockRate, executionTime;
    double capacity, blockW, assoc, nbanks;
    double throughput, latency;
    double duty_cycle, dir_duty_cycle;
    // Sizes of the auxiliary buffers.
    int missb_size, fu_size, prefetchb_size, wbb_size;
 };
 /**
  * Memory-controller parameter record with public fields. The default
  * constructor leaves the scalar fields uninitialized; the (ParseXML*, int)
  * constructor is defined elsewhere.
  */
 class MCParam {
 public:
    MCParam() {}
    MCParam(ParseXML *XML_interface, int ithCache_);
    ~MCParam() {}
    string name;
    double clockRate, num_mcs, peakDataTransferRate, num_channels;
    // Fields disabled in the original source: mcTEPowerperGhz, mcPHYperGbit, area.
    int llcBlockSize, dataBusWidth, addressBusWidth;
    int opcodeW;
    int memAccesses;
    int memRank;
    int type;
    double frontend_duty_cycle, duty_cycle, perc_load;
    double executionTime, reads, writes;
    bool LVDS, withPHY;
 };
 /**
  * Network-on-chip parameter record with public fields. The default
  * constructor leaves the scalar fields uninitialized; the (ParseXML*, int)
  * constructor is defined elsewhere.
  */
 class NoCParam {
 public:
    NoCParam() {}
    NoCParam(ParseXML *XML_interface, int ithCache_);
    ~NoCParam() {}
    string name;
    double clockRate;
    int flit_size;
    int input_ports, output_ports, min_ports, global_linked_ports;
    int virtual_channel_per_port, input_buffer_entries_per_vc;
    int horizontal_nodes, vertical_nodes, total_nodes;
    double executionTime, total_access, link_throughput, link_latency,
           duty_cycle, chip_coverage, route_over_perc;
    // Note that `type` is a bool here, unlike the int `type` of the other records.
    bool has_global_link, type;
 };
 /**
  * Processor-level parameter record: component counts and homogeneity
  * flags. The default constructor leaves the scalar fields uninitialized;
  * the (ParseXML*, int) constructor is defined elsewhere.
  */
 class ProcParam {
 public:
    ProcParam() {}
    ProcParam(ParseXML *XML_interface, int ithCache_);
    ~ProcParam() {}
    string name;
    int numCore, numL2, numL3, numNOC, numL1Dir, numL2Dir, numMC, numMCChannel;
    bool homoCore, homoL2, homoL3, homoNOC, homoL1Dir, homoL2Dir;
 };
 /**
  * NIU parameter record with public fields. The default constructor leaves
  * the scalar fields uninitialized; the (ParseXML*, int) constructor is
  * defined elsewhere.
  */
 class NIUParam {
 public:
    NIUParam() {}
    NIUParam(ParseXML *XML_interface, int ithCache_);
    ~NIUParam() {}
    string name;
    double clockRate;
    int num_units;
    int type;
    double duty_cycle, perc_load;
 };
 /**
  * PCIe parameter record with public fields. The default constructor leaves
  * the scalar fields uninitialized; the (ParseXML*, int) constructor is
  * defined elsewhere.
  */
 class PCIeParam {
 public:
    PCIeParam() {}
    PCIeParam(ParseXML *XML_interface, int ithCache_);
    ~PCIeParam() {}
    string name;
    double clockRate;
    int num_channels, num_units;
    bool withPHY;
    int type;
    double duty_cycle, perc_load;
 };
 #endif /* BASIC_COMPONENTS_H_ */
--- a/ext/mcpat/cacti/README
+++ b/ext/mcpat/cacti/README
@ -0,0 +1,94 @@
 -----------------------------------------------------------
          ____    _    ____ _____ ___    __    ____  
         / ___|  / \  / ___|_   _|_ _|  / /_  | ___| 
        | |     / _ \| |     | |  | |  | '_ \ |___ \ 
        | |___ / ___ \ |___  | |  | |  | (_) | ___) |
         \____/_/   \_\____| |_| |___|  \___(_)____/ 
             A Tool to Model Caches/Memories
 -----------------------------------------------------------
 CACTI is an analytical tool that takes a set of cache/memory para-
 meters as input and calculates its access time, power, cycle 
 time, and area.
 CACTI was originally developed by Dr. Jouppi and Dr. Wilton
 in 1993 and since then it has undergone five major 
 revisions.
 List of features (version 1-6.5):
 ===============================
 The following is the list of features supported by the tool. 
 * Power, delay, area, and cycle time model for 
                  direct mapped caches
                  set-associative caches
                  fully associative caches
                  Embedded DRAM memories
                  Commodity DRAM memories
 * Support for modeling multi-ported uniform cache access (UCA)
  and multi-banked, multi-ported non-uniform cache access (NUCA).
 * Leakage power calculation that also considers the operating
  temperature of the cache.
 * Router power model.
 * Interconnect model with different delay, power, and area 
  properties including low-swing wire model.
 * An interface to perform trade-off analysis involving power, delay,
  area, and bandwidth.
 * All process specific values used by the tool are obtained
  from ITRS and currently, the tool supports 90nm, 65nm, 45nm, 
  and 32nm technology nodes.
 Version 6.5 has a new C++ code base and includes numerous bug fixes.
 CACTI 5.3 and 6.0 activate an entire row of mats to read/write a single
 block of data. This technique improves reliability at the cost of  
 power. CACTI 6.5 activates only the minimum number of mats needed to retrieve
 a block, thereby minimizing power.
 How to use the tool?
 ====================
 Prior versions of CACTI take input parameters such as cache
 size and technology node as a set of command line arguments. 
 To avoid a long list of command line arguments, 
 CACTI 6.5 lets users specify their cache model in a more 
 detailed manner by using a config file (cache.cfg).
 -> define the cache model using cache.cfg
 -> run the "cacti" binary <./cacti -infile cache.cfg>
 CACTI6.5 also provides a command line interface similar to earlier versions
 of CACTI. The command line interface can be used as
 ./cacti  cache_size line_size associativity rw_ports excl_read_ports excl_write_ports 
  single_ended_read_ports search_ports banks tech_node output_width specific_tag tag_width
  access_mode cache main_mem obj_func_delay obj_func_dynamic_power obj_func_leakage_power
  obj_func_cycle_time obj_func_area dev_func_delay dev_func_dynamic_power dev_func_leakage_power
  dev_func_area dev_func_cycle_time ed_ed2_none temp wt data_arr_ram_cell_tech_flavor_in
  data_arr_peri_global_tech_flavor_in tag_arr_ram_cell_tech_flavor_in tag_arr_peri_global_tech_flavor_in
  interconnect_projection_type_in wire_inside_mat_type_in wire_outside_mat_type_in
  REPEATERS_IN_HTREE_SEGMENTS_in VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in 
  BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in PAGE_SIZE_BITS_in BURST_LENGTH_in
  INTERNAL_PREFETCH_WIDTH_in force_wiretype wiretype force_config ndwl ndbl nspd ndcm 
  ndsam1 ndsam2 ecc
 For complete documentation of the tool, please refer to the CACTI-5.3 and 6.0
 technical reports and the following paper,
 "Optimizing NUCA Organizations and Wiring Alternatives for 
 Large Caches With CACTI 6.0", that appears in MICRO 2007.
 We are still improving the tool and refining the code. If you
 have any comments, questions, or suggestions please write to
 us.
 Naveen Muralimanohar             Jung Ho Ahn        Sheng Li
 naveen.muralimanohar@hp.com      gajh@snu.ac.kr     sheng.li@hp.com
--- a/ext/mcpat/cacti/Ucache.cc
+++ b/ext/mcpat/cacti/Ucache.cc
@ -0,0 +1,916 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
 #include <pthread.h>
 #include <algorithm>
 #include <cmath>
 #include <ctime>
 #include <iostream>
 #include <list>
 #include "Ucache.h"
 #include "area.h"
 #include "bank.h"
 #include "basic_circuit.h"
 #include "component.h"
 #include "const.h"
 #include "decoder.h"
 #include "parameter.h"
 #include "subarray.h"
 #include "uca.h"
 using namespace std;
 // Number of worker threads, taken from the NTHREADS macro. It is used as the
 // iteration stride in calc_time_mt_wrapper() and as the size of the thread
 // and work-item arrays in solve().
 const uint32_t nthreads = NTHREADS;
 // Folds another set of minima into this one: each field is lowered to the
 // corresponding field of *val when that one is strictly smaller.
 void min_values_t::update_min_values(const min_values_t * val)
 {
  if (min_delay > val->min_delay)
  {
    min_delay = val->min_delay;
  }
  if (min_dyn > val->min_dyn)
  {
    min_dyn = val->min_dyn;
  }
  if (min_leakage > val->min_leakage)
  {
    min_leakage = val->min_leakage;
  }
  if (min_area > val->min_area)
  {
    min_area = val->min_area;
  }
  if (min_cyc > val->min_cyc)
  {
    min_cyc = val->min_cyc;
  }
 }
 // Lowers each tracked minimum to the matching metric of a UCA organization
 // (access time, read dynamic power, read leakage, area, cycle time) when
 // that metric is strictly smaller.
 void min_values_t::update_min_values(const uca_org_t & res)
 {
  if (min_delay > res.access_time)
  {
    min_delay = res.access_time;
  }
  if (min_dyn > res.power.readOp.dynamic)
  {
    min_dyn = res.power.readOp.dynamic;
  }
  if (min_leakage > res.power.readOp.leakage)
  {
    min_leakage = res.power.readOp.leakage;
  }
  if (min_area > res.area)
  {
    min_area = res.area;
  }
  if (min_cyc > res.cycle_time)
  {
    min_cyc = res.cycle_time;
  }
 }
 // Lowers each tracked minimum to the matching metric stored in
 // res->nuca_pda (delay, read dynamic power, read leakage, area, cycle time)
 // when that metric is strictly smaller.
 void min_values_t::update_min_values(const nuca_org_t * res)
 {
  if (min_delay > res->nuca_pda.delay)
  {
    min_delay = res->nuca_pda.delay;
  }
  if (min_dyn > res->nuca_pda.power.readOp.dynamic)
  {
    min_dyn = res->nuca_pda.power.readOp.dynamic;
  }
  if (min_leakage > res->nuca_pda.power.readOp.leakage)
  {
    min_leakage = res->nuca_pda.power.readOp.leakage;
  }
  if (min_area > res->nuca_pda.area.get_area())
  {
    min_area = res->nuca_pda.area.get_area();
  }
  if (min_cyc > res->nuca_pda.cycle_time)
  {
    min_cyc = res->nuca_pda.cycle_time;
  }
 }
 // Lowers each tracked minimum to the matching metric of a single memory
 // array (access time, read dynamic power, read leakage, area, cycle time)
 // when that metric is strictly smaller.
 void min_values_t::update_min_values(const mem_array * res)
 {
  if (min_delay > res->access_time)
  {
    min_delay = res->access_time;
  }
  if (min_dyn > res->power.readOp.dynamic)
  {
    min_dyn = res->power.readOp.dynamic;
  }
  if (min_leakage > res->power.readOp.leakage)
  {
    min_leakage = res->power.readOp.leakage;
  }
  if (min_area > res->area)
  {
    min_area = res->area;
  }
  if (min_cyc > res->cycle_time)
  {
    min_cyc = res->cycle_time;
  }
 }
 /*
  * pthread entry point that evaluates one thread's share of the array
  * partitioning search space.
  *
  * void_obj must point to a calc_time_mt_wrapper_struct. The thread reads its
  * id, the array kind flags and Nspd_min from it, and writes its results back
  * into the struct's data_arr / tag_arr lists and data_res / tag_res minima.
  *
  * Both lists are cleared on entry and then always carry one spare mem_array
  * at the back; calculate_time() fills that spare entry, and whenever the
  * partition is valid a new spare is appended. The leftover spare entries are
  * deleted before the thread exits.
  *
  * The function ends with pthread_exit(NULL) and has no return statement.
  */
 void * calc_time_mt_wrapper(void * void_obj)
 {
  calc_time_mt_wrapper_struct * calc_obj = (calc_time_mt_wrapper_struct *) void_obj;
  uint32_t tid                   = calc_obj->tid;
  list<mem_array *> & data_arr   = calc_obj->data_arr;
  list<mem_array *> & tag_arr    = calc_obj->tag_arr;
  bool is_tag                    = calc_obj->is_tag;
  bool pure_ram                  = calc_obj->pure_ram;
  bool pure_cam					 = calc_obj->pure_cam;
  bool is_main_mem               = calc_obj->is_main_mem;
  double Nspd_min                = calc_obj->Nspd_min;
  min_values_t * data_res        = calc_obj->data_res;
  min_values_t * tag_res         = calc_obj->tag_res;
  // Start from empty result lists, each holding one spare entry to fill.
  data_arr.clear();
  data_arr.push_back(new mem_array);
  tag_arr.clear();
  tag_arr.push_back(new mem_array);
  // The (Ndwl, Ndbl, Ndcm) combinations are flattened into one index range
  // [0, niter) so that it can be striped across threads.
  uint32_t Ndwl_niter = _log2(MAXDATAN) + 1;
  uint32_t Ndbl_niter = _log2(MAXDATAN) + 1;
  uint32_t Ndcm_niter = _log2(MAX_COL_MUX) + 1;
  uint32_t niter      = Ndwl_niter * Ndbl_niter * Ndcm_niter;
  bool is_valid_partition;
  // Range of wire types to sweep; narrowed when the wire type is forced.
  int wt_min, wt_max;
  if (g_ip->force_wiretype) {
    if (g_ip->wt == 0) {
      wt_min = Low_swing;
      wt_max = Low_swing;
    }
    else {
      wt_min = Global;
      wt_max = Low_swing-1;
    }
  }
  else {
    wt_min = Global;
    wt_max = Low_swing;
  }
  for (double Nspd = Nspd_min; Nspd <= MAXDATASPD; Nspd *= 2)
  {
    for (int wr = wt_min; wr <= wt_max; wr++)
    {
      // This thread handles flattened indices tid, tid + nthreads, ...
      for (uint32_t iter = tid; iter < niter; iter += nthreads)
      {
        // reconstruct Ndwl, Ndbl, Ndcm
        unsigned int Ndwl = 1 << (iter / (Ndbl_niter * Ndcm_niter));
        unsigned int Ndbl = 1 << ((iter / (Ndcm_niter))%Ndbl_niter);
        unsigned int Ndcm = 1 << (iter % Ndcm_niter);
        for(unsigned int Ndsam_lev_1 = 1; Ndsam_lev_1 <= MAX_COL_MUX; Ndsam_lev_1 *= 2)
        {
          for(unsigned int Ndsam_lev_2 = 1; Ndsam_lev_2 <= MAX_COL_MUX; Ndsam_lev_2 *= 2)
          {
            // For debugging: when a configuration is forced for the data
            // array, overwrite the loop variables with the forced values.
            if (g_ip->force_cache_config && is_tag == false)
            {
              wr   = g_ip->wt;
              Ndwl = g_ip->ndwl;
              Ndbl = g_ip->ndbl;
              Ndcm = g_ip->ndcm;
              if(g_ip->nspd != 0) {
                  Nspd = g_ip->nspd;
              }
              if(g_ip->ndsam1 != 0) {
                  Ndsam_lev_1 = g_ip->ndsam1;
                  Ndsam_lev_2 = g_ip->ndsam2;
              }
            }
            if (is_tag == true)
            {
              is_valid_partition = calculate_time(is_tag, pure_ram, pure_cam, Nspd, Ndwl,
                  Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2,
                  tag_arr.back(), 0, NULL, NULL,
                  is_main_mem);
            }
            // If it's a fully-associative cache, the data array partition parameters are identical to that of
            // the tag array, so compute data array partition properties also here.
            if (is_tag == false || g_ip->fully_assoc)
            {
              is_valid_partition = calculate_time(is_tag/*false*/, pure_ram, pure_cam, Nspd, Ndwl,
                  Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2,
                  data_arr.back(), 0, NULL, NULL,
                  is_main_mem);
            }
            // Keep a valid result: stamp its wire type, fold it into the
            // running minima, and append a fresh spare entry for the next try.
            if (is_valid_partition)
            {
              if (is_tag == true)
              {
                tag_arr.back()->wt = (enum Wire_type) wr;
                tag_res->update_min_values(tag_arr.back());
                tag_arr.push_back(new mem_array);
              }
              if (is_tag == false || g_ip->fully_assoc)
              {
                data_arr.back()->wt = (enum Wire_type) wr;
                data_res->update_min_values(data_arr.back());
                data_arr.push_back(new mem_array);
              }
            }
            // Forced configuration: push the loop variables to their end
            // values so the enclosing loops stop after this evaluation.
            // Nspd and the Ndsam levels are only pushed when they were forced.
            if (g_ip->force_cache_config && is_tag == false)
            {
                wr   = wt_max;
                iter = niter;
                if(g_ip->nspd != 0) {
                        Nspd = MAXDATASPD;
                }
                if (g_ip->ndsam1 != 0) {
                        Ndsam_lev_1 = MAX_COL_MUX+1;
                        Ndsam_lev_2 = MAX_COL_MUX+1;
                }
            }
          }
        }
      }
    }
  }
  // Drop the spare entry left at the back of each list.
  delete data_arr.back();
  delete tag_arr.back();
  data_arr.pop_back();
  tag_arr.pop_back();
  pthread_exit(NULL);
 }
 /*
  * Evaluates one array partitioning and copies the resulting metrics into
  * *ptr_array.
  *
  * A DynamicParameter is built from the partitioning arguments; if it reports
  * the combination as invalid the function returns false without touching
  * ptr_array. Otherwise a temporary UCA model is constructed, its delay,
  * area, power and energy figures are copied into *ptr_array, the model is
  * deleted and true is returned.
  *
  * When flag_results_populate is non-zero the copy is skipped entirely (that
  * branch is an empty TODO) and the function still returns true.
  * ptr_results and ptr_fin_res are not used in this function body.
  */
 bool calculate_time(
    bool is_tag,
    int pure_ram,
    bool pure_cam,
    double Nspd,
    unsigned int Ndwl,
    unsigned int Ndbl,
    unsigned int Ndcm,
    unsigned int Ndsam_lev_1,
    unsigned int Ndsam_lev_2,
    mem_array *ptr_array,
    int flag_results_populate,
    results_mem_array *ptr_results,
    uca_org_t *ptr_fin_res,
    bool is_main_mem)
 {
  DynamicParameter dyn_p(is_tag, pure_ram, pure_cam, Nspd, Ndwl, Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2, is_main_mem);
  if (dyn_p.is_valid == false)
  {
    return false;
  }
  UCA * uca = new UCA(dyn_p);
  if (flag_results_populate)
  { //For the final solution, populate the ptr_results data structure  -- TODO: copy only necessary variables
  }
  else
  {
          int num_act_mats_hor_dir = uca->bank.dp.num_act_mats_hor_dir;
          int num_mats = uca->bank.dp.num_mats;
          bool is_fa = uca->bank.dp.fully_assoc;
          // NOTE: this local shadows the pure_cam parameter from here on.
          bool pure_cam = uca->bank.dp.pure_cam;
        // Partitioning parameters of this solution.
        ptr_array->Ndwl = Ndwl;
    ptr_array->Ndbl = Ndbl;
    ptr_array->Nspd = Nspd;
    ptr_array->deg_bl_muxing = dyn_p.deg_bl_muxing;
    ptr_array->Ndsam_lev_1 = Ndsam_lev_1;
    ptr_array->Ndsam_lev_2 = Ndsam_lev_2;
    // Timing and geometry.
    ptr_array->access_time = uca->access_time;
    ptr_array->cycle_time = uca->cycle_time;
    ptr_array->multisubbank_interleave_cycle_time = uca->multisubbank_interleave_cycle_time;
    ptr_array->area_ram_cells = uca->area_all_dataramcells;
    ptr_array->area   = uca->area.get_area();
    ptr_array->height = uca->area.h;
    ptr_array->width  = uca->area.w;
    ptr_array->mat_height = uca->bank.mat.area.h;
    ptr_array->mat_length = uca->bank.mat.area.w;
    ptr_array->subarray_height = uca->bank.mat.subarray.area.h;
    ptr_array->subarray_length = uca->bank.mat.subarray.area.w;
    ptr_array->power  = uca->power;
    // Per-stage delays.
    ptr_array->delay_senseamp_mux_decoder =
      MAX(uca->delay_array_to_sa_mux_lev_1_decoder,
          uca->delay_array_to_sa_mux_lev_2_decoder);
    ptr_array->delay_before_subarray_output_driver         = uca->delay_before_subarray_output_driver;
    ptr_array->delay_from_subarray_output_driver_to_output = uca->delay_from_subarray_out_drv_to_out;
    ptr_array->delay_route_to_bank          = uca->htree_in_add->delay;
    ptr_array->delay_input_htree            = uca->bank.htree_in_add->delay;
    ptr_array->delay_row_predecode_driver_and_block = uca->bank.mat.r_predec->delay;
    ptr_array->delay_row_decoder            = uca->bank.mat.row_dec->delay;
    ptr_array->delay_bitlines               = uca->bank.mat.delay_bitline;
    ptr_array->delay_matchlines               = uca->bank.mat.delay_matchchline;
    ptr_array->delay_sense_amp              = uca->bank.mat.delay_sa;
    ptr_array->delay_subarray_output_driver = uca->bank.mat.delay_subarray_out_drv_htree;
    ptr_array->delay_dout_htree             = uca->bank.htree_out_data->delay;
    ptr_array->delay_comparator             = uca->bank.mat.delay_comparator;
    ptr_array->all_banks_height = uca->area.h;
    ptr_array->all_banks_width  = uca->area.w;
    // Share of the total area taken by the data RAM cells, in percent.
    ptr_array->area_efficiency = uca->area_all_dataramcells * 100 / (uca->area.get_area());
    ptr_array->power_routing_to_bank = uca->power_routing_to_bank;
    ptr_array->power_addr_input_htree = uca->bank.htree_in_add->power;
    ptr_array->power_data_input_htree = uca->bank.htree_in_data->power;
 //    cout<<"power_data_input_htree"<<uca->bank.htree_in_data->power.readOp.leakage<<endl;
    ptr_array->power_data_output_htree = uca->bank.htree_out_data->power;
 //    cout<<"power_data_output_htree"<<uca->bank.htree_out_data->power.readOp.leakage<<endl;
    // Per-mat component power: each value is copied, then its read, write
    // and search dynamic parts are multiplied by num_act_mats_hor_dir.
    // Leakage is left as copied.
    ptr_array->power_row_predecoder_drivers = uca->bank.mat.r_predec->driver_power;
    ptr_array->power_row_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_row_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_row_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_row_predecoder_blocks = uca->bank.mat.r_predec->block_power;
    ptr_array->power_row_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_row_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_row_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_row_decoders = uca->bank.mat.power_row_decoders;
    ptr_array->power_row_decoders.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_row_decoders.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_row_decoders.searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_bit_mux_predecoder_drivers = uca->bank.mat.b_mux_predec->driver_power;
    ptr_array->power_bit_mux_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_bit_mux_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_bit_mux_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_bit_mux_predecoder_blocks  = uca->bank.mat.b_mux_predec->block_power;
    ptr_array->power_bit_mux_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_bit_mux_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_bit_mux_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_bit_mux_decoders = uca->bank.mat.power_bit_mux_decoders;
    ptr_array->power_bit_mux_decoders.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_bit_mux_decoders.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_bit_mux_decoders.searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_1_predecoder_drivers = uca->bank.mat.sa_mux_lev_1_predec->driver_power;
    ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_1_predecoder_blocks = uca->bank.mat.sa_mux_lev_1_predec->block_power;
    ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_1_decoders = uca->bank.mat.power_sa_mux_lev_1_decoders;
    ptr_array->power_senseamp_mux_lev_1_decoders.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_1_decoders.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_1_decoders.searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_2_predecoder_drivers = uca->bank.mat.sa_mux_lev_2_predec->driver_power;
    ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_2_predecoder_blocks = uca->bank.mat.sa_mux_lev_2_predec->block_power;
    ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_2_decoders = uca->bank.mat.power_sa_mux_lev_2_decoders;
    ptr_array->power_senseamp_mux_lev_2_decoders .readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_2_decoders .writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_senseamp_mux_lev_2_decoders .searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_bitlines = uca->bank.mat.power_bitline;
    ptr_array->power_bitlines.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_bitlines.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_bitlines.searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_sense_amps = uca->bank.mat.power_sa;
    ptr_array->power_sense_amps.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_sense_amps.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_sense_amps.searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_prechg_eq_drivers = uca->bank.mat.power_bl_precharge_eq_drv;
    ptr_array->power_prechg_eq_drivers.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_prechg_eq_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_prechg_eq_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_output_drivers_at_subarray = uca->bank.mat.power_subarray_out_drv;
    ptr_array->power_output_drivers_at_subarray.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_output_drivers_at_subarray.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_output_drivers_at_subarray.searchOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_comparators = uca->bank.mat.power_comparator;
    ptr_array->power_comparators.readOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_comparators.writeOp.dynamic *= num_act_mats_hor_dir;
    ptr_array->power_comparators.searchOp.dynamic *= num_act_mats_hor_dir;
 //    cout <<  "  num of mats: " << dyn_p.num_mats << endl;
    // Search-path power is copied only for fully-associative or pure-CAM
    // arrays; here the search dynamic part is multiplied by num_mats.
    if (is_fa || pure_cam)
    {
    ptr_array->power_htree_in_search = uca->bank.htree_in_search->power;
 //    cout<<"power_htree_in_search"<<uca->bank.htree_in_search->power.readOp.leakage<<endl;
    ptr_array->power_htree_out_search = uca->bank.htree_out_search->power;
 //    cout<<"power_htree_out_search"<<uca->bank.htree_out_search->power.readOp.leakage<<endl;
    ptr_array->power_searchline = uca->bank.mat.power_searchline;
 //    cout<<"power_searchlineh"<<uca->bank.mat.power_searchline.readOp.leakage<<endl;
    ptr_array->power_searchline.searchOp.dynamic *= num_mats;
    ptr_array->power_searchline_precharge = uca->bank.mat.power_searchline_precharge;
    ptr_array->power_searchline_precharge.searchOp.dynamic *= num_mats;
    ptr_array->power_matchlines = uca->bank.mat.power_matchline;
    ptr_array->power_matchlines.searchOp.dynamic *= num_mats;
    ptr_array->power_matchline_precharge = uca->bank.mat.power_matchline_precharge;
    ptr_array->power_matchline_precharge.searchOp.dynamic *= num_mats;
    ptr_array->power_matchline_to_wordline_drv = uca->bank.mat.power_ml_to_ram_wl_drv;
 //    cout<<"power_matchline.searchOp.leakage"<<uca->bank.mat.power_matchline.searchOp.leakage<<endl;
    }
    // Energy, refresh and leakage figures copied unchanged from the model.
    ptr_array->activate_energy = uca->activate_energy;
    ptr_array->read_energy = uca->read_energy;
    ptr_array->write_energy = uca->write_energy;
    ptr_array->precharge_energy = uca->precharge_energy;
    ptr_array->refresh_power = uca->refresh_power;
    ptr_array->leak_power_subbank_closed_page = uca->leak_power_subbank_closed_page;
    ptr_array->leak_power_subbank_open_page = uca->leak_power_subbank_open_page;
    ptr_array->leak_power_request_and_reply_networks = uca->leak_power_request_and_reply_networks;
    ptr_array->precharge_delay = uca->precharge_delay;
 //      cout<<"power_matchline.searchOp.leakage"<<uca->bank.mat.<<endl;
 //
 //    if (!(is_fa || pure_cam))
 //    {
 //     cout <<  "  num of cols: " << dyn_p.num_c_subarray << endl;
 //    }
 //    else if (is_fa)
 //    {
 //  	  cout <<  "  num of cols: " << dyn_p.tag_num_c_subarray+ dyn_p.data_num_c_subarray<< endl;
 //    } else
 //  	  cout <<  "  num of cols: " << dyn_p.tag_num_c_subarray<< endl;
 //      cout <<  uca->bank.mat.subarray.get_total_cell_area()<<endl;
  }
  // The UCA model is only needed while its results are being copied.
  delete uca;
  return true;
 }
 /*
  * Returns true when every metric of u (access time, read dynamic power,
  * read leakage, cycle time, area) exceeds the corresponding minimum in
  * *minval by no more than the percentage allowed by the matching g_ip->*_dev
  * setting. The checks run in that order and stop at the first failure.
  */
 bool check_uca_org(uca_org_t & u, min_values_t *minval)
 {
  return !(((u.access_time - minval->min_delay)*100/minval->min_delay) > g_ip->delay_dev)
      && !(((u.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
           g_ip->dynamic_power_dev)
      && !(((u.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
           g_ip->leakage_power_dev)
      && !(((u.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
           g_ip->cycle_time_dev)
      && !(((u.area - minval->min_area)/minval->min_area)*100 >
           g_ip->area_dev);
 }
 /*
  * Returns true when every metric of the memory array u (access time, read
  * dynamic power, read leakage, cycle time, area) exceeds the corresponding
  * minimum in *minval by no more than the percentage allowed by the matching
  * g_ip->*_dev setting. The checks run in that order and stop at the first
  * failure.
  */
 bool check_mem_org(mem_array & u, const min_values_t *minval)
 {
  return !(((u.access_time - minval->min_delay)*100/minval->min_delay) > g_ip->delay_dev)
      && !(((u.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
           g_ip->dynamic_power_dev)
      && !(((u.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
           g_ip->leakage_power_dev)
      && !(((u.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
           g_ip->cycle_time_dev)
      && !(((u.area - minval->min_area)/minval->min_area)*100 >
           g_ip->area_dev);
 }
 /*
  * Picks the lowest-cost organization from ulist and copies it into *res.
  *
  * The cost function depends on g_ip->ed:
  *   1     - normalized delay * normalized read dynamic power
  *   2     - normalized delay squared * normalized read dynamic power
  *   other - weighted sum of normalized delay, cycle time, read dynamic
  *           power, read leakage and area, evaluated only for entries that
  *           pass check_uca_org(). In this mode entries that fail the check,
  *           and entries that become the new best, are erased from ulist.
  *
  * All metrics are normalized by the minima in *minval. The program prints
  * an error and exits if ulist is empty or if no entry produced a cost below
  * BIGNUM.
  */
 void find_optimal_uca(uca_org_t *res, min_values_t * minval, list<uca_org_t> & ulist)
 {
  double cost = 0;
  double min_cost = BIGNUM;
  float d, a, dp, lp, c;
  dp = g_ip->dynamic_power_wt;
  lp = g_ip->leakage_power_wt;
  a  = g_ip->area_wt;
  d  = g_ip->delay_wt;
  c  = g_ip->cycle_time_wt;
  if (ulist.empty() == true)
  {
    cout << "ERROR: no valid cache organizations found" << endl;
    exit(0);
  }
  // list::erase() already returns the element after the erased one, so the
  // iterator is advanced explicitly and only when nothing was erased. The
  // previous for-loop form skipped the element following an erased head and
  // incremented end() when the last remaining element was erased.
  list<uca_org_t>::iterator niter = ulist.begin();
  while (niter != ulist.end())
  {
    if (g_ip->ed == 1)
    {
      cost = ((niter)->access_time/minval->min_delay) * ((niter)->power.readOp.dynamic/minval->min_dyn);
      if (min_cost > cost)
      {
        min_cost = cost;
        *res = (*(niter));
      }
      ++niter;
    }
    else if (g_ip->ed == 2)
    {
      cost = ((niter)->access_time/minval->min_delay)*
             ((niter)->access_time/minval->min_delay)*
             ((niter)->power.readOp.dynamic/minval->min_dyn);
      if (min_cost > cost)
      {
        min_cost = cost;
        *res = (*(niter));
      }
      ++niter;
    }
    else
    {
      /*
       * check whether the current organization
       * meets the input deviation constraints
       */
      bool v = check_uca_org(*niter, minval);
      if (!v)
      {
        niter = ulist.erase(niter);
        continue;
      }
      cost = (d  * ((niter)->access_time/minval->min_delay) +
              c  * ((niter)->cycle_time/minval->min_cyc) +
              dp * ((niter)->power.readOp.dynamic/minval->min_dyn) +
              lp * ((niter)->power.readOp.leakage/minval->min_leakage) +
              a  * ((niter)->area/minval->min_area));
      if (min_cost > cost)
      {
        // New best: keep a copy in *res and drop the list entry.
        min_cost = cost;
        *res = (*(niter));
        niter = ulist.erase(niter);
      }
      else
      {
        ++niter;
      }
    }
  }
  if (min_cost == BIGNUM)
  {
    cout << "ERROR: no cache organizations met optimization criteria" << endl;
    exit(0);
  }
 }
 void filter_tag_arr(const min_values_t * min, list<mem_array *> & list)
 {
  double cost = BIGNUM;
  double cur_cost;
  double wt_delay = g_ip->delay_wt, wt_dyn = g_ip->dynamic_power_wt, wt_leakage = g_ip->leakage_power_wt, wt_cyc = g_ip->cycle_time_wt, wt_area = g_ip->area_wt;
  mem_array * res = NULL;
  if (list.empty() == true)
  {
    cout << "ERROR: no valid tag organizations found" << endl;
    exit(1);
  }
  while (list.empty() != true)
  {
    bool v = check_mem_org(*list.back(), min);
    if (v)
    {
      cur_cost = wt_delay   * (list.back()->access_time/min->min_delay) +
        wt_dyn     * (list.back()->power.readOp.dynamic/min->min_dyn) +
        wt_leakage * (list.back()->power.readOp.leakage/min->min_leakage) +
        wt_area    * (list.back()->area/min->min_area) +
        wt_cyc     * (list.back()->cycle_time/min->min_cyc);
    }
    else
    {
      cur_cost = BIGNUM;
    }
    if (cur_cost < cost)
    {
      if (res != NULL)
      {
        delete res;
      }
      cost = cur_cost;
      res  = list.back();
    }
    else
    {
      delete list.back();
    }
    list.pop_back();
  }
  if(!res)
  {
    cout << "ERROR: no valid tag organizations found" << endl;
    exit(0);
  }
  list.push_back(res);
 }
 /*
  * Removes (and deletes) every data array whose access time AND read dynamic
  * power both exceed the minima recorded in its arr_min by more than 50%.
  *
  * The program prints an error and exits if curr_list is empty on entry, and
  * exits if the list contains a null entry.
  */
 void filter_data_arr(list<mem_array *> & curr_list)
 {
  if (curr_list.empty() == true)
  {
    cout << "ERROR: no valid data array organizations found" << endl;
    exit(1);
  }
  // list::erase() returns the element after the erased one, so the iterator
  // is advanced only when nothing was erased. The previous erase-then-
  // decrement form decremented begin() when the first element was removed.
  list<mem_array *>::iterator iter = curr_list.begin();
  while (iter != curr_list.end())
  {
    mem_array * m = *iter;
    if (m == NULL) exit(1);
    if(((m->access_time - m->arr_min->min_delay)/m->arr_min->min_delay > 0.5) &&
       ((m->power.readOp.dynamic - m->arr_min->min_dyn)/m->arr_min->min_dyn > 0.5))
    {
      delete m;
      iter = curr_list.erase(iter);
    }
    else
    {
      ++iter;
    }
  }
 }
 /*
 * Performs exhaustive search across different sub-array sizes,
 * wire types and aspect ratios to find an optimal UCA organization
 * 1. First different valid tag array organizations are calculated
 *    and stored in tag_arr array
 * 2. The exhaustive search is repeated to find valid data array
 *    organizations and stored in data_arr array
 * 3. Cache area, delay, power, and cycle time for different
 *    cache organizations are calculated based on the
 *    above results
 * 4. Cache model with least cost is picked from sol_list
 */
 // Searches for the best organization of the structure described by g_ip and
 // stores the outcome through fin_res.
 //   fin_res - result object; its tag_array summary fields are zeroed first and
 //             find_optimal_uca() is then called to fill it from the candidates.
 // Candidate arrays are produced by calc_time_mt_wrapper() on nthreads worker
 // threads. If a worker cannot be created the process prints an error and
 // exits with status 1, since joining a thread that was never started is
 // undefined behavior.
 void solve(uca_org_t *fin_res)
 {
  int    pure_ram = g_ip->pure_ram;
  bool   pure_cam = g_ip->pure_cam;
  init_tech_params(g_ip->F_sz_um, false);
  list<mem_array *> tag_arr (0);
  list<mem_array *> data_arr(0);
  list<mem_array *>::iterator miter;
  // The last element of sol_list is always a blank slot for the next candidate.
  list<uca_org_t> sol_list(1, uca_org_t());
  fin_res->tag_array.access_time = 0;
  fin_res->tag_array.Ndwl = 0;
  fin_res->tag_array.Ndbl = 0;
  fin_res->tag_array.Nspd = 0;
  fin_res->tag_array.deg_bl_muxing = 0;
  fin_res->tag_array.Ndsam_lev_1 = 0;
  fin_res->tag_array.Ndsam_lev_2 = 0;
  // distribute calculate_time() execution to multiple threads
  calc_time_mt_wrapper_struct * calc_array = new calc_time_mt_wrapper_struct[nthreads];
  pthread_t threads[nthreads];
  for (uint32_t t = 0; t < nthreads; t++)
  {
    calc_array[t].tid         = t;
    calc_array[t].pure_ram    = pure_ram;
    calc_array[t].pure_cam    = pure_cam;
    // per-thread minima; released at the end of this function
    calc_array[t].data_res    = new min_values_t();
    calc_array[t].tag_res     = new min_values_t();
  }
  bool     is_tag;
  // If it's a cache, first calculate the area, delay and power for all tag array partitions.
  if (!(pure_ram||pure_cam||g_ip->fully_assoc))
  { //cache
    is_tag              = true;
    init_tech_params(g_ip->F_sz_um, is_tag);
    for (uint32_t t = 0; t < nthreads; t++)
    {
      calc_array[t].is_tag      = is_tag;
      calc_array[t].is_main_mem = false;
      calc_array[t].Nspd_min    = 0.125;
      if (pthread_create(&threads[t], NULL, calc_time_mt_wrapper, (void *)(&(calc_array[t]))) != 0)
      {
        cout << "ERROR: Cannot create worker thread for tag array search" << endl;
        exit(1);
      }
    }
    for (uint32_t t = 0; t < nthreads; t++)
    {
      pthread_join(threads[t], NULL);
    }
    for (uint32_t t = 0; t < nthreads; t++)
    {
      calc_array[t].data_arr.sort(mem_array::lt);
      data_arr.merge(calc_array[t].data_arr, mem_array::lt);
      calc_array[t].tag_arr.sort(mem_array::lt);
      tag_arr.merge(calc_array[t].tag_arr, mem_array::lt);
    }
  }
  // calculate the area, delay and power for all data array partitions (for cache or plain RAM).
  // CAMs and fully associative caches are handled here as a single array in the data portion.
  is_tag              = false;
  init_tech_params(g_ip->F_sz_um, is_tag);
  for (uint32_t t = 0; t < nthreads; t++)
  {
    calc_array[t].is_tag      = is_tag;
    calc_array[t].is_main_mem = g_ip->is_main_mem;
    if (!(pure_cam||g_ip->fully_assoc))
    {
      calc_array[t].Nspd_min    = (double)(g_ip->out_w)/(double)(g_ip->block_sz*8);
    }
    else
    {
      calc_array[t].Nspd_min    = 1;
    }
    if (pthread_create(&threads[t], NULL, calc_time_mt_wrapper, (void *)(&(calc_array[t]))) != 0)
    {
      cout << "ERROR: Cannot create worker thread for data array search" << endl;
      exit(1);
    }
  }
  for (uint32_t t = 0; t < nthreads; t++)
  {
    pthread_join(threads[t], NULL);
  }
  // Only the results of the data pass are kept in data_arr.
  // NOTE(review): clear() drops the pointers merged during the tag pass without
  // deleting them; this assumes the tag pass leaves the per-thread data_arr
  // lists empty - confirm against calc_time_mt_wrapper().
  data_arr.clear();
  for (uint32_t t = 0; t < nthreads; t++)
  {
    calc_array[t].data_arr.sort(mem_array::lt);
    data_arr.merge(calc_array[t].data_arr, mem_array::lt);
  }
  min_values_t * d_min = new min_values_t();
  min_values_t * t_min = new min_values_t();
  min_values_t * cache_min = new min_values_t();
  // fold the per-thread minima into one set for data arrays and one for tag arrays
  for (uint32_t t = 0; t < nthreads; t++)
  {
    d_min->update_min_values(calc_array[t].data_res);
    t_min->update_min_values(calc_array[t].tag_res);
  }
  for (miter = data_arr.begin(); miter != data_arr.end(); miter++)
  {
    (*miter)->arr_min = d_min;
  }
  filter_data_arr(data_arr);
  if(!(pure_ram||pure_cam||g_ip->fully_assoc))
  {
    filter_tag_arr(t_min, tag_arr);
  }
  if (pure_ram||pure_cam||g_ip->fully_assoc)
  {
    // no separate tag array: every data array is a candidate on its own
    for (miter = data_arr.begin(); miter != data_arr.end(); miter++)
    {
      uca_org_t & curr_org  = sol_list.back();
      curr_org.tag_array2  = NULL;
      curr_org.data_array2 = (*miter);
      curr_org.find_delay();
      curr_org.find_energy();
      curr_org.find_area();
      curr_org.find_cyc();
      //update min values for the entire cache
      cache_min->update_min_values(curr_org);
      sol_list.push_back(uca_org_t());
    }
  }
  else
  {
    // cache: pair every remaining tag array with every remaining data array
    while (tag_arr.empty() != true)
    {
      // The tag array is not deleted here because the candidates built below
      // keep pointing at it; it is not freed anywhere else in this function.
      mem_array * arr_temp = (tag_arr.back());
      tag_arr.pop_back();
      for (miter = data_arr.begin(); miter != data_arr.end(); miter++)
      {
        uca_org_t & curr_org  = sol_list.back();
        curr_org.tag_array2  = arr_temp;
        curr_org.data_array2 = (*miter);
        curr_org.find_delay();
        curr_org.find_energy();
        curr_org.find_area();
        curr_org.find_cyc();
        //update min values for the entire cache
        cache_min->update_min_values(curr_org);
        sol_list.push_back(uca_org_t());
      }
    }
  }
  // remove the trailing blank slot before selecting the winner
  sol_list.pop_back();
  find_optimal_uca(fin_res, cache_min, sol_list);
  sol_list.clear();
  // free every data array except the one the result still references
  for (miter = data_arr.begin(); miter != data_arr.end(); ++miter)
  {
    if (*miter != fin_res->data_array2)
    {
      delete *miter;
    }
  }
  data_arr.clear();
  for (uint32_t t = 0; t < nthreads; t++)
  {
    delete calc_array[t].data_res;
    delete calc_array[t].tag_res;
  }
  delete [] calc_array;
  delete cache_min;
  delete d_min;
  delete t_min;
 }
 // Rebuilds the tag array (when fin_res has one) and the data array from the
 // organization parameters stored in fin_res, copies the freshly computed
 // power numbers back into fin_res, and re-runs fin_res->find_energy().
 //   fin_res - previously solved organization; data_array2 must be non-NULL,
 //             tag_array2 may be NULL.
 // Prints an error and exits with status 1 if either parameter set is
 // reported invalid by DynamicParameter.
 void update(uca_org_t *fin_res)
 {
  if(fin_res->tag_array2)
  {
    init_tech_params(g_ip->F_sz_um,true);
    DynamicParameter tag_arr_dyn_p(true, g_ip->pure_ram, g_ip->pure_cam, fin_res->tag_array2->Nspd, fin_res->tag_array2->Ndwl, fin_res->tag_array2->Ndbl, fin_res->tag_array2->Ndcm, fin_res->tag_array2->Ndsam_lev_1, fin_res->tag_array2->Ndsam_lev_2, g_ip->is_main_mem);
    if(tag_arr_dyn_p.is_valid)
    {
      UCA * tag_arr = new UCA(tag_arr_dyn_p);
      fin_res->tag_array2->power = tag_arr->power;
      // only the power numbers are needed; release the temporary model
      delete tag_arr;
    }
    else
    {
      cout << "ERROR: Cannot retrieve array structure for leakage feedback" << endl;
      exit(1);
    }
  }
  init_tech_params(g_ip->F_sz_um,false);
  DynamicParameter data_arr_dyn_p(false, g_ip->pure_ram, g_ip->pure_cam, fin_res->data_array2->Nspd, fin_res->data_array2->Ndwl, fin_res->data_array2->Ndbl, fin_res->data_array2->Ndcm, fin_res->data_array2->Ndsam_lev_1, fin_res->data_array2->Ndsam_lev_2, g_ip->is_main_mem);
  if(data_arr_dyn_p.is_valid)
  {
    UCA * data_arr = new UCA(data_arr_dyn_p);
    fin_res->data_array2->power = data_arr->power;
    // only the power numbers are needed; release the temporary model
    delete data_arr;
  }
  else
  {
    cout << "ERROR: Cannot retrieve array structure for leakage feedback" << endl;
    exit(1);
  }
  fin_res->find_energy();
 }
--- a/ext/mcpat/cacti/Ucache.h
+++ b/ext/mcpat/cacti/Ucache.h
@ -0,0 +1,115 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
 #ifndef __UCACHE_H__
 #define __UCACHE_H__
 #include <list>
 #include "area.h"
 #include "nuca.h"
 #include "router.h"
 // Holds one running minimum per cost metric (delay, dynamic, leakage, area,
 // cycle). The update_min_values() overloads take the different result types
 // a minimum can be updated from; their definitions are not in this header.
 class min_values_t
 {
  public:
    double min_delay;
    double min_dyn;
    double min_leakage;
    double min_area;
    double min_cyc;

    // Every metric starts at BIGNUM.
    min_values_t()
      : min_delay(BIGNUM),
        min_dyn(BIGNUM),
        min_leakage(BIGNUM),
        min_area(BIGNUM),
        min_cyc(BIGNUM)
    {
    }

    void update_min_values(const min_values_t * val);
    void update_min_values(const uca_org_t & res);
    void update_min_values(const nuca_org_t * res);
    void update_min_values(const mem_array * res);
 };
 // Plain record tying a tag array and a data array (by index and by list
 // iterator) to a set of summary metrics.
 // NOTE(review): nothing in the visible part of Ucache.cc reads or writes this
 // struct; the field meanings are inferred from their names only - confirm
 // before relying on them.
 struct solution
 {
  int    tag_array_index;
  int    data_array_index;
  list<mem_array *>::iterator tag_array_iter;
  list<mem_array *>::iterator data_array_iter;
  double access_time;
  double cycle_time;
  double area;
  double efficiency;
  powerDef total_power;
 };
 // Evaluates one array organization, identified by the partitioning
 // parameters Nspd, Ndwl, Ndbl, Ndcm, Ndsam_lev_1 and Ndsam_lev_2.
 // NOTE(review): the definition is not visible here. Presumably the return
 // value says whether the organization is valid, and ptr_array / ptr_results /
 // ptr_fin_res receive the results depending on flag_results_populate -
 // confirm against the definition in Ucache.cc.
 bool calculate_time(
    bool is_tag,
    int pure_ram,
    bool pure_cam,
    double Nspd,
    unsigned int Ndwl,
    unsigned int Ndbl,
    unsigned int Ndcm,
    unsigned int Ndsam_lev_1,
    unsigned int Ndsam_lev_2,
    mem_array *ptr_array,
    int flag_results_populate,
    results_mem_array *ptr_results,
    uca_org_t *ptr_fin_res,
    bool is_main_mem);
 // Recomputes the power of the tag and data arrays already selected in fin_res
 // and refreshes its energy numbers (defined in Ucache.cc).
 void update(uca_org_t *fin_res);
 // Searches the candidate organizations and stores the chosen one in fin_res
 // (defined in Ucache.cc).
 void solve(uca_org_t *fin_res);
 // Called by solve() and update() before each tag or data array pass, with
 // g_ip->F_sz_um as tech and is_tag selecting the pass.
 // NOTE(review): definition not visible here; presumably sets up the global
 // technology parameters - confirm.
 void init_tech_params(double tech, bool is_tag);
 // Per-thread work descriptor that solve() passes to calc_time_mt_wrapper().
 // solve() fills the input fields before starting each thread and reads the
 // result fields after joining it.
 struct calc_time_mt_wrapper_struct
 {
  uint32_t tid;          // index of the worker thread, 0 .. nthreads-1
  bool     is_tag;       // true during the tag array pass, false during the data array pass
  // NOTE(review): solve() assigns this from an int (g_ip->pure_ram) and
  // calculate_time() takes an int; the bool here collapses any value other
  // than 0/1 - confirm that is intended.
  bool     pure_ram;
  bool     pure_cam;
  bool     is_main_mem;
  double   Nspd_min;     // set by solve(): 0.125 for tags, out_w/(block_sz*8) or 1 for data
  min_values_t * data_res;   // allocated and deleted by solve()
  min_values_t * tag_res;    // allocated and deleted by solve()
  list<mem_array *> data_arr;   // sorted and merged into the global list by solve()
  list<mem_array *> tag_arr;    // sorted and merged into the global list by solve()
 };
 // Thread entry point handed to pthread_create() by solve(); void_obj is the
 // address of that thread's calc_time_mt_wrapper_struct.
 void *calc_time_mt_wrapper(void * void_obj);
 #endif
--- a/ext/mcpat/cacti/arbiter.cc
+++ b/ext/mcpat/cacti/arbiter.cc
@ -0,0 +1,130 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
 #include "arbiter.h"
 // Sets up the arbiter model.
 //   n_req      - stored in R; used as the number of requesters by the power model
 //   flit_size_ - flit width reported by print_arbiter() in bits
 //   output_len - stored in o_len; crossbar_ctrline() scales it by 1e-6 into metres
 //   dt         - device type supplying Vdd and the n-to-p drive ratio; the
 //                pointer is stored, not copied
 // Transistor sizes are derived from the feature size g_ip->F_sz_um.
 // TriS1 and TriS2 are not used by this file; they are zeroed so the object
 // never carries indeterminate values.
 Arbiter::Arbiter(
    double n_req,
    double flit_size_,
    double output_len,
    TechnologyParameter::DeviceType *dt
    ):R(n_req), flit_size(flit_size_),
    o_len (output_len), deviceType(dt),
    TriS1(0), TriS2(0)
 {
  min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_;
  Vdd = dt->Vdd;
  double technology = g_ip->F_sz_um;
  NTn1 = 13.5*technology/2;
  PTn1 = 76*technology/2;
  NTn2 = 13.5*technology/2;
  PTn2 = 76*technology/2;
  NTi = 12.5*technology/2;
  PTi = 25*technology/2;
  NTtr = 10*technology/2; /*Transmission gate's nmos tr. length*/
  PTtr = 20*technology/2; /* pmos tr. length*/
 }
 // Empty destructor: nothing is released here, including the stored
 // deviceType pointer.
 Arbiter::~Arbiter()
 {
 }
 // Capacitance term for the request path: (R-1) first-stage gate loads plus
 // the second-stage gate load and the inverter's gate and drain capacitance.
 double
 Arbiter::arb_req() {
  const double stage1_gate_cap = 2*gate_C(NTn1, 0)+gate_C(PTn1, 0);

  // terms are accumulated in the original left-to-right order
  double cap = (R-1)*stage1_gate_cap;
  cap = cap + 2*gate_C(NTn2, 0);
  cap = cap + gate_C(PTn2, 0);
  cap = cap + gate_C(NTi, 0);
  cap = cap + gate_C(PTi, 0);
  cap = cap + drain_C_(NTi, 0, 1, 1, g_tp.cell_h_def);
  cap = cap + drain_C_(PTi, 1, 1, 1, g_tp.cell_h_def);
  return cap;
 }
 // Capacitance term for the priority path: twice the first-stage gate load.
 // The switching capacitance of the flip-flop is ignored.
 double
 Arbiter::arb_pri() {
  const double stage1_gate_cap = 2*gate_C(NTn1, 0)+gate_C(PTn1, 0);
  return 2*stage1_gate_cap;
 }
 // Capacitance term for the grant path: first-stage drain capacitance plus the
 // crossbar control line load from crossbar_ctrline().
 double
 Arbiter::arb_grant() {
  double cap = drain_C_(NTn1, 0, 1, 1, g_tp.cell_h_def)*2;
  cap = cap + drain_C_(PTn1, 1, 1, 1, g_tp.cell_h_def);
  cap = cap + crossbar_ctrline();
  return cap;
 }
 // Capacitance term for the internal node: first-stage drain capacitance plus
 // second-stage gate capacitance.
 double
 Arbiter::arb_int() {
  double cap = drain_C_(NTn1, 0, 1, 1, g_tp.cell_h_def)*2;
  cap = cap + drain_C_(PTn1, 1, 1, 1, g_tp.cell_h_def);
  cap = cap + 2*gate_C(NTn2, 0);
  cap = cap + gate_C(PTn2, 0);
  return cap;
 }
 // Fills power.readOp: dynamic from the four capacitance terms and Vdd,
 // leakage and gate_leakage from the two NOR stages and the inverter.
 void
 Arbiter::compute_power() {
  // dynamic part, one term per capacitance helper
  const double req_term   = R*arb_req()*Vdd*Vdd/2;
  const double pri_term   = R*arb_pri()*Vdd*Vdd/2;
  const double grant_term = arb_grant()*Vdd*Vdd;
  const double int_term   = arb_int()*0.5*Vdd*Vdd;
  power.readOp.dynamic = (req_term + pri_term + grant_term + int_term);

  // subthreshold leakage currents
  double nor1_leak = cmos_Isub_leakage(g_tp.min_w_nmos_*NTn1*2, min_w_pmos * PTn1*2, 2, nor);
  double nor2_leak = cmos_Isub_leakage(g_tp.min_w_nmos_*NTn2*R, min_w_pmos * PTn2*R, 2, nor);
  double not_leak  = cmos_Isub_leakage(g_tp.min_w_nmos_*NTi, min_w_pmos * PTi, 1, inv);
  power.readOp.leakage = (nor1_leak + nor2_leak + not_leak)*Vdd; //FIXME include priority table leakage

  // gate leakage currents for the same three gates
  double nor1_leak_gate = cmos_Ig_leakage(g_tp.min_w_nmos_*NTn1*2, min_w_pmos * PTn1*2, 2, nor);
  double nor2_leak_gate = cmos_Ig_leakage(g_tp.min_w_nmos_*NTn2*R, min_w_pmos * PTn2*R, 2, nor);
  double not_leak_gate  = cmos_Ig_leakage(g_tp.min_w_nmos_*NTi, min_w_pmos * PTi, 1, inv);
  power.readOp.gate_leakage = nor1_leak_gate*Vdd + nor2_leak_gate*Vdd + not_leak_gate*Vdd;
 }
 double //wire cap with triple spacing
 Arbiter::Cw3(double length) {
  Wire wc(g_ip->wt, length, 1, 3, 3);
  double temp = (wc.wire_cap(length,true));
  return temp;
 }
 // Load of the crossbar control line: wire capacitance for o_len (scaled by
 // 1e-6 into metres) plus the inverter's drain and gate capacitance.
 double
 Arbiter::crossbar_ctrline() {
  double cap = Cw3(o_len * 1e-6 /* m */);
  cap = cap + drain_C_(NTi, 0, 1, 1, g_tp.cell_h_def);
  cap = cap + drain_C_(PTi, 1, 1, 1, g_tp.cell_h_def);
  cap = cap + gate_C(NTi, 0);
  cap = cap + gate_C(PTi, 0);
  return cap;
 }
 // Gate capacitance of the transmission gate's nmos and pmos transistors.
 double
 Arbiter::transmission_buf_ctrcap() {
  return gate_C(NTtr, 0)+gate_C(PTtr, 0);
 }
 // Prints the requester count, flit size, and the readOp dynamic and leakage
 // values (scaled by 1e9 and 1e3 respectively) to cout.
 void Arbiter::print_arbiter()
 {
  cout << "\nArbiter Stats ("   << R << " input arbiter" << ")\n\n"
       << "Flit size        : " << flit_size << " bits" << endl
       << "Dynamic Power    : " << power.readOp.dynamic*1e9 << " (nJ)" << endl
       << "Leakage Power    : " << power.readOp.leakage*1e3 << " (mW)" << endl;
 }
--- a/ext/mcpat/cacti/arbiter.h
+++ b/ext/mcpat/cacti/arbiter.h
@ -0,0 +1,79 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
 #ifndef __ARBITER__
 #define __ARBITER__
 #include <assert.h>
 #include <iostream>
 #include "basic_circuit.h"
 #include "cacti_interface.h"
 #include "component.h"
 #include "mat.h"
 #include "parameter.h"
 #include "wire.h"
 // Power model of an arbiter with R requesters. The capacitance helpers
 // (arb_req, arb_pri, arb_grant, arb_int) feed compute_power(), which fills
 // the inherited power member.
 class Arbiter : public Component
 {
  public:
    // Req: number of requesters; flit_sz: flit width in bits (reporting only);
    // output_len: control line length, scaled by 1e-6 into metres in
    // crossbar_ctrline(); dt: device type, defaults to the global peripheral
    // device and is stored as a pointer.
    Arbiter(
      double Req,
      double flit_sz,
      double output_len,
      TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
    ~Arbiter();
    // Prints a short summary to cout.
    void print_arbiter();
    // Capacitance terms combined by compute_power().
    double arb_req();
    double arb_pri();
    double arb_grant();
    double arb_int();
    // Computes readOp dynamic, leakage and gate_leakage.
    void compute_power();
    // Wire capacitance with triple spacing for a wire of length len.
    double Cw3(double len);
    double crossbar_ctrline();
    double transmission_buf_ctrcap();
  private:
    // Transistor sizes derived from the feature size in the constructor;
    // R is the number of requesters.
    double NTn1, PTn1, NTn2, PTn2, R, PTi, NTi;
    double flit_size;
    double NTtr, PTtr;   // transmission gate nmos / pmos sizes
    double o_len;
    TechnologyParameter::DeviceType *deviceType;   // not freed by the destructor
    double TriS1, TriS2;   // not used by any method in arbiter.cc
    double min_w_pmos, Vdd;
 };
 #endif
--- a/ext/mcpat/cacti/area.cc
+++ b/ext/mcpat/cacti/area.cc
@ -0,0 +1,47 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
 #include <cassert>
 #include <cmath>
 #include <iostream>
 #include "area.h"
 #include "basic_circuit.h"
 #include "component.h"
 #include "decoder.h"
 #include "parameter.h"
 using namespace std;
--- a/ext/mcpat/cacti/area.h
+++ b/ext/mcpat/cacti/area.h
@ -0,0 +1,71 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
 #ifndef __AREA_H__
 #define __AREA_H__
 #include "basic_circuit.h"
 #include "cacti_interface.h"
 using namespace std;
 // Footprint of a component, held either as explicit width and height or as a
 // bare area value set through set_area().
 class Area
 {
 public:
  double w;
  double h;

  // Starts with zero width, height and area.
  Area()
    : w(0), h(0), area(0)
  {
  }

  double get_w() const { return w; }
  double get_h() const { return h; }

  // Returns the stored area only while both dimensions are zero; otherwise
  // returns w*h (which is 0 if exactly one dimension is zero).
  double get_area() const
  {
    const bool no_dimensions = (w == 0 && h == 0);
    return no_dimensions ? area : w*h;
  }

  void set_w(double w_) { w = w_; }
  void set_h(double h_) { h = h_; }
  void set_area(double a_) { area = a_; }

 private:
  double area;
 };
 #endif
--- a/ext/mcpat/cacti/bank.cc
+++ b/ext/mcpat/cacti/bank.cc
@ -0,0 +1,198 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
 #include <iostream>
 #include "bank.h"
 // Builds one bank from dyn_p: constructs the mat, sizes the address and data
 // bit counts from the port configuration, and allocates the H-trees.
 //   dyn_p - organization parameters; kept by reference in dp, so it must
 //           outlive the Bank.
 // Port counts come from dp when dp.use_inp_params is set, otherwise from g_ip.
 // Fully associative and CAM banks also get search H-trees; for every other
 // bank the two search pointers stay NULL.
 Bank::Bank(const DynamicParameter & dyn_p):
  dp(dyn_p), mat(dp),
  htree_in_search(NULL), htree_out_search(NULL),
  num_addr_b_mat(dyn_p.number_addr_bits_mat),
  num_mats_hor_dir(dyn_p.num_mats_h_dir), num_mats_ver_dir(dyn_p.num_mats_v_dir)
 {
  int RWP;
  int ERP;
  int EWP;
  int SCHP;
  if (dp.use_inp_params)
  {
    RWP  = dp.num_rw_ports;
    ERP  = dp.num_rd_ports;
    EWP  = dp.num_wr_ports;
    SCHP = dp.num_search_ports;
  }
  else
  {
    RWP  = g_ip->num_rw_ports;
    ERP  = g_ip->num_rd_ports;
    EWP  = g_ip->num_wr_ports;
    SCHP = g_ip->num_search_ports;
  }
  int total_addrbits = (dp.number_addr_bits_mat + dp.number_subbanks_decode)*(RWP+ERP+EWP);
  int datainbits     = dp.num_di_b_bank_per_port * (RWP + EWP);
  int dataoutbits    = dp.num_do_b_bank_per_port * (RWP + ERP);
  if (!(dp.fully_assoc || dp.pure_cam))
  {
    // with fast access, the data array output is widened by the associativity
    if (g_ip->fast_access && dp.is_tag == false)
    {
      dataoutbits *= g_ip->data_assoc;
    }
    htree_in_add   = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits, 0,dataoutbits,0, num_mats_ver_dir*2, num_mats_hor_dir*2, Add_htree);
    htree_in_data  = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits, 0,dataoutbits,0, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree);
    htree_out_data = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits, 0,dataoutbits,0, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree);
  }
  else
  {
    // search widths only exist for fully associative and CAM banks
    int searchinbits  = dp.num_si_b_bank_per_port * SCHP;
    int searchoutbits = dp.num_so_b_bank_per_port * SCHP;
    htree_in_add   = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits, searchinbits,dataoutbits,searchoutbits, num_mats_ver_dir*2, num_mats_hor_dir*2, Add_htree);
    htree_in_data  = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree);
    htree_out_data = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits,num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree);
    htree_in_search  = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree,true, true);
    htree_out_search = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits,num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree,true);
  }
  // the bank footprint is taken from the data-in H-tree in both cases
  area.w = htree_in_data->area.w;
  area.h = htree_in_data->area.h;
  num_addr_b_row_dec = _log2(mat.subarray.num_rows);
  num_addr_b_routed_to_mat_for_act = num_addr_b_row_dec;
  num_addr_b_routed_to_mat_for_rd_or_wr = num_addr_b_mat - num_addr_b_row_dec;
 }
 // Releases the H-trees allocated by the constructor. The search H-trees are
 // only released for fully associative and CAM banks, matching the condition
 // under which the constructor allocates them.
 Bank::~Bank()
 {
  delete htree_in_add;
  delete htree_out_data;
  delete htree_in_data;

  const bool owns_search_trees = (dp.fully_assoc || dp.pure_cam);
  if (!owns_search_trees)
  {
    return;
  }
  delete htree_in_search;
  delete htree_out_search;
 }
 // Forwards to the mat's delay computation and returns its result unchanged.
 double Bank::compute_delays(double inrisetime)
 {
  const double mat_result = mat.compute_delays(inrisetime);
  return mat_result;
 }
 // Accumulates the bank's power from the mat and the H-trees. Read dynamic
 // takes the address and data-out trees; leakage and gate leakage take all
 // trees. Fully associative and CAM banks additionally accumulate search
 // dynamic power and the leakage of the two search trees.
 void Bank::compute_power_energy()
 {
  mat.compute_power_energy();
  const bool cam_like = (dp.fully_assoc || dp.pure_cam);

  // mat contribution
  if (cam_like)
  {
    //for fa and cam num_act_mats_hor_dir is 1 for plain r/w
    power.readOp.dynamic += mat.power.readOp.dynamic ;
  }
  else
  {
    power.readOp.dynamic += mat.power.readOp.dynamic * dp.num_act_mats_hor_dir;
  }
  power.readOp.leakage      += mat.power.readOp.leakage * dp.num_mats;
  power.readOp.gate_leakage += mat.power.readOp.gate_leakage * dp.num_mats;

  // address and data H-trees, shared by every bank type
  power.readOp.dynamic += htree_in_add->power.readOp.dynamic;
  power.readOp.dynamic += htree_out_data->power.readOp.dynamic;

  power.readOp.leakage += htree_in_add->power.readOp.leakage;
  power.readOp.leakage += htree_in_data->power.readOp.leakage;
  power.readOp.leakage += htree_out_data->power.readOp.leakage;

  power.readOp.gate_leakage += htree_in_add->power.readOp.gate_leakage;
  power.readOp.gate_leakage += htree_in_data->power.readOp.gate_leakage;
  power.readOp.gate_leakage += htree_out_data->power.readOp.gate_leakage;

  if (!cam_like)
  {
    return;
  }

  // search path, only present for fully associative and CAM banks
  power.searchOp.dynamic += mat.power.searchOp.dynamic * dp.num_mats;
  power.searchOp.dynamic += mat.power_bl_precharge_eq_drv.searchOp.dynamic +
                            mat.power_sa.searchOp.dynamic +
                            mat.power_bitline.searchOp.dynamic +
                            mat.power_subarray_out_drv.searchOp.dynamic+
                            mat.ml_to_ram_wl_drv->power.readOp.dynamic;
  power.searchOp.dynamic += htree_in_search->power.searchOp.dynamic;
  power.searchOp.dynamic += htree_out_search->power.searchOp.dynamic;

  power.readOp.leakage += htree_in_search->power.readOp.leakage;
  power.readOp.leakage += htree_out_search->power.readOp.leakage;

  power.readOp.gate_leakage += htree_in_search->power.readOp.gate_leakage;
  power.readOp.gate_leakage += htree_out_search->power.readOp.gate_leakage;
 }
--- a/ext/mcpat/cacti/bank.h
+++ b/ext/mcpat/cacti/bank.h
@ -0,0 +1,69 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
 #ifndef __BANK_H__
 #define __BANK_H__
 #include "component.h"
 #include "decoder.h"
 #include "htree2.h"
 #include "mat.h"
 // One memory bank: a mat model plus the H-trees that carry address, data and
 // (for fully associative / CAM banks) search signals.
 class Bank : public Component
 {
  public:
    // dyn_p is kept by reference in dp, so it must outlive the Bank.
    Bank(const DynamicParameter & dyn_p);
    ~Bank();
    double compute_delays(double inrisetime);  // return outrisetime
    // Accumulates mat and H-tree power into the inherited power member.
    void   compute_power_energy();
    const DynamicParameter & dp;
    Mat   mat;
    // H-trees are allocated in the constructor and freed in the destructor.
    Htree2 *htree_in_add;
    Htree2 *htree_in_data;
    Htree2 *htree_out_data;
    // Allocated only when dp.fully_assoc or dp.pure_cam is set.
    Htree2 *htree_in_search;
    Htree2 *htree_out_search;
    int  num_addr_b_mat;
    int  num_mats_hor_dir;
    int  num_mats_ver_dir;
    // _log2 of the subarray row count.
    int  num_addr_b_row_dec;
    // Equal to num_addr_b_row_dec.
    int  num_addr_b_routed_to_mat_for_act;
    // num_addr_b_mat minus num_addr_b_row_dec.
    int  num_addr_b_routed_to_mat_for_rd_or_wr;
 };
 #endif
--- a/ext/mcpat/cacti/basic_circuit.cc
+++ b/ext/mcpat/cacti/basic_circuit.cc
@ -0,0 +1,829 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include <cassert>
 #include <cmath>
 #include <iostream>
 #include "basic_circuit.h"
 #include "parameter.h"
 // Integer base-2 logarithm, rounded down: the number of right shifts
 // needed to reduce num to 1. A zero argument is fatal: "log0?" is written
 // to stderr and the process exits with status 1.
 uint32_t _log2(uint64_t num)
 {
  if (num == 0)
  {
    std::cerr << "log0?" << std::endl;
    exit(1);
  }
  uint32_t shifts = 0;
  for (uint64_t rest = num; rest > 1; rest >>= 1)
  {
    ++shifts;
  }
  return shifts;
 }
 // Returns true when val is a positive power of two (1, 2, 4, ...), false
 // for zero, negative values and everything else.
 // A positive power of two has exactly one bit set, so clearing the lowest
 // set bit (val & (val - 1)) leaves zero. This replaces the earlier pair of
 // _log2() shift loops with a constant-time test and yields the same answers.
 bool is_pow2(int64_t val)
 {
  if (val <= 0)
  {
    return false;
  }
  return (val & (val - 1)) == 0;
 }
 // Integer exponentiation by repeated multiplication: base multiplied by
 // itself n times. Returns 1 when n <= 0. No overflow check is made.
 int powers (int base, int n)
 {
  int result = 1;
  for (int remaining = n; remaining > 0; --remaining)
  {
    result *= base;
  }
  return result;
 }
 /*----------------------------------------------------------------------*/
 // Base-2 logarithm of x computed as ln(x) / ln(2). x must be strictly
 // positive; this is enforced with assert().
 double logtwo (double x)
 {
  assert(x > 0);
  const double ln_x = log (x);
  const double ln_2 = log (2.0);
  return (double) (ln_x / ln_2);
 }
 /*----------------------------------------------------------------------*/
 double gate_C(
    double width,
    double wirelength,
    bool   _is_dram,
    bool   _is_cell,
    bool   _is_wl_tr)
 {
  const TechnologyParameter::DeviceType * dt;
  if (_is_dram && _is_cell)
  {
    dt = &g_tp.dram_acc;   //DRAM cell access transistor
  }
  else if (_is_dram && _is_wl_tr)
  {
    dt = &g_tp.dram_wl;    //DRAM wordline transistor
  }
  else if (!_is_dram && _is_cell)
  {
    dt = &g_tp.sram_cell;  // SRAM cell access transistor
  }
  else
  {
    dt = &g_tp.peri_global;
  }
  return (dt->C_g_ideal + dt->C_overlap + 3*dt->C_fringe)*width + dt->l_phy*Cpolywire;
 }
 // returns gate capacitance in Farads
 // actually this function is the same as gate_C() now
 double gate_C_pass(
    double width,       // gate width in um (length is Lphy_periph_global)
    double wirelength,  // poly wire length going to gate in lambda
    bool   _is_dram,
    bool   _is_cell,
    bool   _is_wl_tr)
 {
  // v5.0
  const TechnologyParameter::DeviceType * dt;
  if ((_is_dram) && (_is_cell))
  {
    dt = &g_tp.dram_acc;   //DRAM cell access transistor
  }
  else if ((_is_dram) && (_is_wl_tr))
  {
    dt = &g_tp.dram_wl;    //DRAM wordline transistor
  }
  else if ((!_is_dram) && _is_cell)
  {
    dt = &g_tp.sram_cell;  // SRAM cell access transistor
  }
  else
  {
    dt = &g_tp.peri_global;
  }
  return (dt->C_g_ideal + dt->C_overlap + 3*dt->C_fringe)*width + dt->l_phy*Cpolywire;
 }
 // Drain capacitance of a transistor that may be folded and/or stacked.
 // The result is the sum of four terms: junction area, junction sidewall,
 // drain-to-gate (fringe + overlap) and, when the device is folded, the
 // local-wire metal that connects the folds.
 //   width          transistor width before folding
 //   nchannel       nonzero selects the n-channel share of the cell height
 //                  when the fold width is derived from a cell height
 //   stack          stack count; (stack - 1) poly-to-poly spacings are added
 //                  to the drain width
 //   next_arg_thresh_folding_width_or_height_cell
 //                  0: fold_dimension is the folding width threshold;
 //                  otherwise: fold_dimension is the height of the cell
 //   fold_dimension interpreted according to the previous argument
 //   _is_dram/_is_cell/_is_wl_tr  select the device parameter set
 double drain_C_(
    double width,
    int nchannel,
    int stack,
    int next_arg_thresh_folding_width_or_height_cell,
    double fold_dimension,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr)
 {
  double w_folded_tr;
  const  TechnologyParameter::DeviceType * dt;
  if ((_is_dram) && (_is_cell))
  {
    dt = &g_tp.dram_acc;   // DRAM cell access transistor
  }
  else if ((_is_dram) && (_is_wl_tr))
  {
    dt = &g_tp.dram_wl;    // DRAM wordline transistor
  }
  else if ((!_is_dram) && _is_cell)
  {
    dt = &g_tp.sram_cell;  // SRAM cell access transistor
  }
  else
  {
    dt = &g_tp.peri_global;
  }
  double c_junc_area = dt->C_junc;
  double c_junc_sidewall = dt->C_junc_sidewall;
  // fringe and overlap are both counted twice
  double c_fringe    = 2*dt->C_fringe;
  double c_overlap   = 2*dt->C_overlap;
  double drain_C_metal_connecting_folded_tr = 0;
  // determine the width of the transistor after folding (if it is getting folded)
  if (next_arg_thresh_folding_width_or_height_cell == 0)
  { // interpret fold_dimension as the folding width threshold
    // i.e. the value of transistor width above which the transistor gets folded
    w_folded_tr = fold_dimension;
  }
  else
  { // interpret fold_dimension as the height of the cell that this transistor is part of.
    double h_tr_region  = fold_dimension - 2 * g_tp.HPOWERRAIL;
    // TODO : w_folded_tr must come from Component::compute_gate_area()
    // Hard-coded split of the usable height: 2/3 to p-type, 1/3 to n-type.
    double ratio_p_to_n = 2.0 / (2.0 + 1.0);
    if (nchannel)
    {
      w_folded_tr = (1 - ratio_p_to_n) * (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS);
    }
    else
    {
      w_folded_tr = ratio_p_to_n * (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS);
    }
  }
  // Number of folds needed to fit the full width; fewer than two folds means
  // the device is not folded and keeps its full width.
  int num_folded_tr = (int) (ceil(width / w_folded_tr));
  if (num_folded_tr < 2)
  {
    w_folded_tr = width;
  }
  double total_drain_w = (g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact) +  // only for drain
                         (stack - 1) * g_tp.spacing_poly_to_poly;
  double drain_h_for_sidewall = w_folded_tr;
  double total_drain_height_for_cap_wrt_gate = w_folded_tr + 2 * w_folded_tr * (stack - 1);
  if (num_folded_tr > 1)
  {
    // Folded device: extend the drain width for the extra folds.
    total_drain_w += (num_folded_tr - 2) * (g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact) +
                     (num_folded_tr - 1) * ((stack - 1) * g_tp.spacing_poly_to_poly);
    // With an even fold count the height contribution to the sidewall is dropped.
    if (num_folded_tr%2 == 0)
    {
      drain_h_for_sidewall = 0;
    }
    total_drain_height_for_cap_wrt_gate *= num_folded_tr;
    drain_C_metal_connecting_folded_tr   = g_tp.wire_local.C_per_um * total_drain_w;
  }
  double drain_C_area     = c_junc_area * total_drain_w * w_folded_tr;
  double drain_C_sidewall = c_junc_sidewall * (drain_h_for_sidewall + 2 * total_drain_w);
  double drain_C_wrt_gate = (c_fringe + c_overlap) * total_drain_height_for_cap_wrt_gate;
  return (drain_C_area + drain_C_sidewall + drain_C_wrt_gate + drain_C_metal_connecting_folded_tr);
 }
 double tr_R_on(
    double width,
    int nchannel,
    int stack,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr)
 {
  const TechnologyParameter::DeviceType * dt;
  if ((_is_dram) && (_is_cell))
  {
    dt = &g_tp.dram_acc;   //DRAM cell access transistor
  }
  else if ((_is_dram) && (_is_wl_tr))
  {
    dt = &g_tp.dram_wl;    //DRAM wordline transistor
  }
  else if ((!_is_dram) && _is_cell)
  {
    dt = &g_tp.sram_cell;  // SRAM cell access transistor
  }
  else
  {
    dt = &g_tp.peri_global;
  }
  double restrans = (nchannel) ? dt->R_nch_on : dt->R_pch_on;
  return (stack * restrans / width);
 }
 /* This routine operates in reverse: given a resistance, it finds
 * the transistor width that would have this R.  It is used in the
 * data wordline to estimate the wordline driver size. */
 // returns width in um
 double R_to_w(
    double res,
    int   nchannel,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr)
 {
  const TechnologyParameter::DeviceType * dt;
  if ((_is_dram) && (_is_cell))
  {
    dt = &g_tp.dram_acc;   //DRAM cell access transistor
  }
  else if ((_is_dram) && (_is_wl_tr))
  {
    dt = &g_tp.dram_wl;    //DRAM wordline transistor
  }
  else if ((!_is_dram) && (_is_cell))
  {
    dt = &g_tp.sram_cell;  // SRAM cell access transistor
  }
  else
  {
    dt = &g_tp.peri_global;
  }
  double restrans = (nchannel) ? dt->R_nch_on : dt->R_pch_on;
  return (restrans / res);
 }
 // PMOS-to-NMOS sizing ratio, read from the n_to_p_eff_curr_drv_ratio field:
 // the DRAM wordline device (g_tp.dram_wl) when both flags are set, the
 // global peripheral device (g_tp.peri_global) in every other case.
 double pmos_to_nmos_sz_ratio(
    bool _is_dram,
    bool _is_wl_tr)
 {
  const bool dram_wordline = _is_dram && _is_wl_tr;
  return dram_wordline ? g_tp.dram_wl.n_to_p_eff_curr_drv_ratio
                       : g_tp.peri_global.n_to_p_eff_curr_drv_ratio;
 }
 // Gate delay after "Timing Models for MOS Circuits" by Mark Horowitz, 1984.
 // With a zero input ramp and equal thresholds the result is tf * |ln(vs1)|.
 // Otherwise the delay combines a square-root term that depends on the input
 // ramp with a threshold-difference term; the rising-input form uses
 // b = 0.5 with ln(vs), the falling-input form uses b = 0.4 with ln(1 - vs).
 double horowitz(
    double inputramptime, // input rise time
    double tf,            // time constant of gate
    double vs1,           // threshold voltage
    double vs2,           // threshold voltage
    int    rise)          // whether input rises or fall
 {
  if (inputramptime == 0 && vs1 == vs2)
  {
    const double ln_vs1 = log(vs1);
    return tf * (vs1 < 1 ? -ln_vs1 : ln_vs1);
  }
  const double ramp_ratio = inputramptime / tf;
  if (rise == RISE)
  {
    const double ln_vs1 = log(vs1);
    return (tf * sqrt(ln_vs1*ln_vs1 + 2*ramp_ratio*0.5*(1.0 - vs1)) + tf*(ln_vs1 - log(vs2)));
  }
  const double ln_one_minus_vs1 = log(1.0 - vs1);
  return (tf * sqrt(ln_one_minus_vs1*ln_one_minus_vs1 + 2*ramp_ratio*0.4*(vs1)) + tf*(ln_one_minus_vs1 - log(1.0 - vs2)));
 }
 double cmos_Ileak(
    double nWidth,
    double pWidth,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr)
 {
  TechnologyParameter::DeviceType * dt;
  if ((!_is_dram)&&(_is_cell))
  { //SRAM cell access transistor
    dt = &(g_tp.sram_cell);
  }
  else if ((_is_dram)&&(_is_wl_tr))
  { //DRAM wordline transistor
    dt = &(g_tp.dram_wl);
  }
  else
  { //DRAM or SRAM all other transistors
    dt = &(g_tp.peri_global);
  }
  return nWidth*dt->I_off_n + pWidth*dt->I_off_p;
 }
 double simplified_nmos_leakage(
    double nwidth,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr)
 {
  TechnologyParameter::DeviceType * dt;
  if ((!_is_dram)&&(_is_cell))
  { //SRAM cell access transistor
    dt = &(g_tp.sram_cell);
  }
  else if ((_is_dram)&&(_is_wl_tr))
  { //DRAM wordline transistor
    dt = &(g_tp.dram_wl);
  }
  else
  { //DRAM or SRAM all other transistors
    dt = &(g_tp.peri_global);
  }
  return nwidth * dt->I_off_n;
 }
 // Partial factorial: the product of the integers m, m+1, ..., n.
 // With m = 1 this is n!. An empty range (m > n) yields 1, the empty
 // product; the previous code seeded the accumulator with m and therefore
 // returned m in that case, which made combination(n, n) evaluate to n + 1.
 // The result is a plain int; no overflow check is made.
 int factorial(int n, int m)
 {
  int product = 1;
  for (int i = m; i <= n; i++)
  {
    product *= i;
  }
  return product;
 }
 // Binomial coefficient C(n, m): the number of ways to choose m items out
 // of n. Returns 0 when m is negative or larger than n.
 // Computed multiplicatively so that C(n, n) is 1 (the factorial-quotient
 // form returned n + 1 there) and so that intermediate values stay small:
 // after step i the running value equals C(n - m + i, i), which makes each
 // division exact.
 int combination(int n, int m)
 {
  if (m < 0 || m > n)
  {
    return 0;
  }
  if (m > n - m)
  {
    m = n - m;   // C(n, m) == C(n, n - m); use the shorter loop
  }
  long long ways = 1;
  for (int i = 1; i <= m; ++i)
  {
    ways = ways * (n - m + i) / i;
  }
  return (int) ways;
 }
 double simplified_pmos_leakage(
    double pwidth,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr)
 {
  TechnologyParameter::DeviceType * dt;
  if ((!_is_dram)&&(_is_cell))
  { //SRAM cell access transistor
    dt = &(g_tp.sram_cell);
  }
  else if ((_is_dram)&&(_is_wl_tr))
  { //DRAM wordline transistor
    dt = &(g_tp.dram_wl);
  }
  else
  { //DRAM or SRAM all other transistors
    dt = &(g_tp.peri_global);
  }
  return pwidth * dt->I_off_p;
 }
 double cmos_Ig_n(
    double nWidth,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr)
 {
  TechnologyParameter::DeviceType * dt;
  if ((!_is_dram)&&(_is_cell))
  { //SRAM cell access transistor
    dt = &(g_tp.sram_cell);
  }
  else if ((_is_dram)&&(_is_wl_tr))
  { //DRAM wordline transistor
    dt = &(g_tp.dram_wl);
  }
  else
  { //DRAM or SRAM all other transistors
    dt = &(g_tp.peri_global);
  }
  return nWidth*dt->I_g_on_n;
 }
 double cmos_Ig_p(
    double pWidth,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr)
 {
  TechnologyParameter::DeviceType * dt;
  if ((!_is_dram)&&(_is_cell))
  { //SRAM cell access transistor
    dt = &(g_tp.sram_cell);
  }
  else if ((_is_dram)&&(_is_wl_tr))
  { //DRAM wordline transistor
    dt = &(g_tp.dram_wl);
  }
  else
  { //DRAM or SRAM all other transistors
    dt = &(g_tp.peri_global);
  }
  return pWidth*dt->I_g_on_p;
 }
 // Subthreshold leakage of a gate, averaged over its input states.
 //   nWidth/pWidth  widths passed to simplified_nmos_leakage() and
 //                  simplified_pmos_leakage()
 //   fanin          number of inputs, must be >= 1 (asserted)
 //   g_type         gate kind; selects the formula below
 //   topo           for nmos/pmos networks with fanin > 1: parallel or series
 // Series stacks weight each state with k off transistors by
 // UNI_LEAK_STACK_FACTOR^(k-1) times combination(fanin, k).
 // An unknown g_type trips assert(0).
 double cmos_Isub_leakage(
    double nWidth,
    double pWidth,
    int    fanin,
    enum Gate_type g_type,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr,
    enum Half_net_topology topo)
 {
  assert (fanin>=1);
  const double n_off = simplified_nmos_leakage(nWidth, _is_dram, _is_cell, _is_wl_tr);
  const double p_off = simplified_pmos_leakage(pWidth, _is_dram, _is_cell, _is_wl_tr);
  const int state_count = int(pow(2.0, fanin));
  double total = 0;
  int k;   // number of off transistors in the state being counted

  if (g_type == nmos || g_type == pmos)
  {
    // single-type network: same formulas, different per-device leakage
    const double leak = (g_type == nmos) ? n_off : p_off;
    if (fanin==1)
    {
      total = leak/state_count;
    }
    else if (topo==parallel)
    {
      // only when all tx are off, leakage power is non-zero.
      // The possibility of this state is 1/num_states
      total = leak*fanin/state_count;
    }
    else
    {
      // when k == 0 there is no leakage power, so start at 1
      for (k=1; k<=fanin; k++)
      {
        total += leak*pow(UNI_LEAK_STACK_FACTOR,(k-1))*combination(fanin, k);
      }
      total /= state_count;
    }
  }
  else if (g_type == inv || g_type == tg)
  {
    total = (n_off + p_off)/2;
  }
  else if (g_type == nand)
  {
    total += fanin*p_off;   // the pullup network
    for (k=1; k<=fanin; k++)   // the pulldown network
    {
      total += n_off*pow(UNI_LEAK_STACK_FACTOR,(k-1))*combination(fanin, k);
    }
    total /= state_count;
  }
  else if (g_type == nor)
  {
    for (k=1; k<=fanin; k++)   // the pullup network
    {
      total += p_off*pow(UNI_LEAK_STACK_FACTOR,(k-1))*combination(fanin, k);
    }
    total += fanin*n_off;   // the pulldown network
    total /= state_count;
  }
  else if (g_type == tri)
  {
    total += (n_off + p_off)/2;             // enabled
    total += n_off*UNI_LEAK_STACK_FACTOR;   // disabled upper bound of leakage power
    total /= 2;
  }
  else
  {
    assert(0);
  }
  return total;
 }
 // Gate (tunnelling) leakage of a gate, averaged over its input states.
 //   nWidth/pWidth  widths passed to cmos_Ig_n() and cmos_Ig_p()
 //   fanin          number of inputs, must be >= 1 (asserted)
 //   g_type         gate kind; selects the formula below
 //   topo           for nmos/pmos networks with fanin > 1: parallel or series
 // Parallel networks count every on transistor in every state:
 //   sum_k leak * combination(fanin, k) * k.
 // Series networks count fanin transistors when all are on and, as an
 // approximation, half of the on transistors in partially-on states.
 // Every multi-state sum is divided by the number of input states. The
 // nmos/pmos parallel branch previously skipped that division, so it returned
 // a total over all states rather than an average, unlike the series branch
 // and the nand/nor branches.
 // An unknown g_type trips assert(0).
 double cmos_Ig_leakage(
    double nWidth,
    double pWidth,
    int    fanin,
    enum Gate_type g_type,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr,
    enum Half_net_topology topo)
 {
  assert (fanin>=1);
  const double nmos_leak = cmos_Ig_n(nWidth, _is_dram, _is_cell, _is_wl_tr);
  const double pmos_leak = cmos_Ig_p(pWidth, _is_dram, _is_cell, _is_wl_tr);
  const int num_states = int(pow(2.0, fanin));
  double Ig_on = 0;
  int num_on_tx;   // number of on transistors in the state being counted

  switch (g_type)
  {
    case nmos:
    case pmos:
    {
      // single-type network: same formulas, different per-device leakage
      const double leak = (g_type == nmos) ? nmos_leak : pmos_leak;
      if (fanin==1)
      {
        Ig_on = leak/num_states;
      }
      else
      {
        if (topo==parallel)
        {
          for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)
          {
            Ig_on += leak*combination(fanin, num_on_tx)*num_on_tx;
          }
        }
        else
        {
          Ig_on += leak * fanin;   // series network when all TXs are on
          for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)   // when num_on_tx=[1,n-1]
          {
            // TODO: this is an approximation now, a precise computation will be very complicated.
            Ig_on += leak*combination(fanin, num_on_tx)*num_on_tx/2;
          }
        }
        // average over the input states (applies to both topologies)
        Ig_on /= num_states;
      }
      break;
    }
    case inv:
    case tg:
      Ig_on = (nmos_leak + pmos_leak)/2;
      break;
    case nand:
      // pull up network (parallel PMOS)
      for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)   // when num_on_tx=[1,n]
      {
        Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx;
      }
      // pull down network (series NMOS)
      Ig_on += nmos_leak * fanin;   // pull down network when all TXs are on
      for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)   // when num_on_tx=[1,n-1]
      {
        // TODO: this is an approximation now, a precise computation will be very complicated.
        Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;
      }
      Ig_on /= num_states;
      break;
    case nor:
      // pull up network (series PMOS)
      Ig_on += pmos_leak * fanin;   // pull up network when all TXs are on
      for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)
      {
        Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;
      }
      // pull down network (parallel NMOS)
      for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)   // when num_on_tx=[1,n]
      {
        Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx;
      }
      Ig_on /= num_states;
      break;
    case tri:
      Ig_on += (2*nmos_leak + 2*pmos_leak)/2;   // enabled
      Ig_on += (nmos_leak + pmos_leak)/2;       // disabled upper bound of leakage power
      Ig_on /= 2;
      break;
    default:
      assert(0);
      break;
  }
  return Ig_on;
 }
 // Simplified short-circuit estimate (the value is actually an energy).
 // Returns the mean of a discharge term and a charge term. Both share the
 // factor
 //   (10/3) * ((vdd - vt) - vt/vdd)^3 / velocity_index^2 / 2^(3*(vt/vdd)^2)
 // multiplied by c_in * vdd^2, and differ in the drive ratio they use:
 //   discharge: fo_p^2 / fanout / beta_ratio
 //   charge:    fo_n^2 / fanout * beta_ratio
 // where fo_n = i_on_n/i_on_n_in, fo_p = i_on_p/i_on_p_in,
 // fanout = c_out/c_in and beta_ratio = i_on_p/i_on_n.
 // w_nmos and w_pmos are accepted but not used.
 //
 // The 10/3 coefficient is now evaluated in floating point; it was written
 // as the integer expression 10/3, which truncates to 3.
 // The former "high" variants were computed but never used in the result and
 // have been removed.
 double shortcircuit_simple(
    double vt,
    double velocity_index,
    double c_in,
    double c_out,
    double w_nmos,
    double w_pmos,
    double i_on_n,
    double i_on_p,
    double i_on_n_in,
    double i_on_p_in,
    double vdd)
 {
  const double fo_n = i_on_n/i_on_n_in;
  const double fo_p = i_on_p/i_on_p_in;
  const double fanout = c_out/c_in;
  const double beta_ratio = i_on_p/i_on_n;
  const double vt_to_vdd_ratio = vt/vdd;

  // factor shared by the charge and discharge terms
  const double shape = pow(((vdd-vt)-vt_to_vdd_ratio),3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio);

  const double p_short_circuit_discharge = 10.0/3.0*shape*c_in*vdd*vdd*fo_p*fo_p/fanout/beta_ratio;
  const double p_short_circuit_charge    = 10.0/3.0*shape*c_in*vdd*vdd*fo_n*fo_n/fanout*beta_ratio;

  // A harmonic mean with the "high" variants cannot be applied to these
  // simple formulas, so only the two "low" terms are averaged.
  return (p_short_circuit_discharge + p_short_circuit_charge)/2;
 }
 // Detailed short-circuit estimate.
 // NOTE(review): as written this function always returns 0. The return
 // value is p_short_circuit, which is initialized to 0 and never assigned
 // again; p_short_circuit_discharge is computed on the last statement and
 // then discarded. fo_n, c_out, w_nmos and w_pmos do not influence any
 // value that is used.
 // NOTE(review): g_v_alpha and h_v_alpha raise (1 - velocity_index) and
 // (1 - 2*velocity_index) to powers that depend on velocity_index; for a
 // non-integer velocity_index > 1 those pow() calls have a negative base and
 // a non-integer exponent. Confirm the intended formula before wiring the
 // discharge value into the return.
 double shortcircuit(
    double vt,
    double velocity_index,
    double c_in,
    double c_out,
    double w_nmos,
    double w_pmos,
    double i_on_n,
    double i_on_p,
    double i_on_n_in,
    double i_on_p_in,
    double vdd)
 {
        double p_short_circuit=0, p_short_circuit_discharge;//, p_short_circuit_charge, p_short_circuit_discharge_low, p_short_circuit_discharge_high, p_short_circuit_charge_low, p_short_circuit_charge_high; //this is actually energy
        double fo_n, fo_p, fanout, beta_ratio, vt_to_vdd_ratio;
        double f_alpha, k_v, e, g_v_alpha, h_v_alpha;
        fo_n		= i_on_n/i_on_n_in;
        fo_p		= i_on_p/i_on_p_in;
        // fanout is fixed at 1 here; c_out is not consulted
        fanout		= 1;
        beta_ratio 	= i_on_p/i_on_n;
        vt_to_vdd_ratio = vt/vdd;
        // truncated value of Euler's number
        e 			= 	2.71828;
        f_alpha		=	1/(velocity_index+2) -velocity_index/(2*(velocity_index+3)) +velocity_index/(velocity_index+4)*(velocity_index/2-1);
        k_v			=	0.9/0.8+(vdd-vt)/0.8*log(10*(vdd-vt)/e);
        g_v_alpha	=	(velocity_index + 1)*pow((1-velocity_index),velocity_index)*pow((1-velocity_index),velocity_index/2)/f_alpha/pow((1-velocity_index-velocity_index),(velocity_index/2+velocity_index+2));
        h_v_alpha	=   pow(2, velocity_index)*(velocity_index+1)*pow((1-velocity_index),velocity_index)/pow((1-velocity_index-velocity_index),(velocity_index+1));
        //p_short_circuit_discharge_low 	= 10/3*(pow(0.5-vt_to_vdd_ratio,3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio))*c_in*vdd*vdd*fo_p*fo_p/fanout/beta_ratio;
 //	p_short_circuit_discharge_low 	= 10/3*(pow(((vdd-vt)-vt_to_vdd_ratio),3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio))*c_in*vdd*vdd*fo_p*fo_p/fanout/beta_ratio;
 //	p_short_circuit_charge_low 		= 10/3*(pow(((vdd-vt)-vt_to_vdd_ratio),3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio))*c_in*vdd*vdd*fo_n*fo_n/fanout*beta_ratio;
 //	double t1, t2, t3, t4, t5;
 //	t1=pow(((vdd-vt)-vt_to_vdd_ratio),3);
 //	t2=pow(velocity_index,2.0);
 //	t3=pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio);
 //	t4=t1/t2/t3;
 //
 //	cout <<t1<<"t1\n"<<t2<<"t2\n"<<t3<<"t3\n"<<t4<<"t4\n"<<fanout<<endl;
 //
 //
 //	p_short_circuit_discharge_high 	= pow(((vdd-vt)-vt_to_vdd_ratio),1.5)*c_in*vdd*vdd*fo_p/10/pow(2, 3*vt_to_vdd_ratio+2*velocity_index);
 //	p_short_circuit_charge_high 	= pow(((vdd-vt)-vt_to_vdd_ratio),1.5)*c_in*vdd*vdd*fo_n/10/pow(2, 3*vt_to_vdd_ratio+2*velocity_index);
 //
 //	p_short_circuit_discharge = 1.0/(1.0/p_short_circuit_discharge_low + 1.0/p_short_circuit_discharge_high);
 //	p_short_circuit_charge = 1/(1/p_short_circuit_charge_low + 1/p_short_circuit_charge_high);
 //
 //	p_short_circuit = (p_short_circuit_discharge + p_short_circuit_charge)/2;
 //
 //	p_short_circuit = p_short_circuit_discharge;
        // computed but not returned (see NOTE(review) above)
        p_short_circuit_discharge = k_v*vdd*vdd*c_in*fo_p*fo_p/((vdd-vt)*g_v_alpha*fanout*beta_ratio/2/k_v + h_v_alpha*fo_p);
  return (p_short_circuit);
 }
--- a/ext/mcpat/cacti/basic_circuit.h
+++ b/ext/mcpat/cacti/basic_circuit.h
@ -0,0 +1,248 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __BASIC_CIRCUIT_H__
 #define __BASIC_CIRCUIT_H__
 #include "cacti_interface.h"
 #include "const.h"
 using namespace std;
 // Leakage multiplier applied once per additional off transistor in a
 // series stack; cmos_Isub_leakage() uses it as
 // pow(UNI_LEAK_STACK_FACTOR, num_off_tx - 1).
 #define UNI_LEAK_STACK_FACTOR 0.43
 // Integer helpers defined in basic_circuit.cc.
 // powers: base multiplied by itself n times (1 when n <= 0).
 int powers (int base, int n);
 // is_pow2: true only for positive powers of two.
 bool is_pow2(int64_t val);
 // _log2: floor of log2(num); exits the process when num == 0.
 uint32_t _log2(uint64_t num);
 // factorial: for m <= n, the product of the integers m through n
 // (n! with the default m = 1).
 int factorial(int n, int m = 1);
 // combination: intended to give the number of ways to choose m of n.
 int combination(int n, int m);
 //#define DBG
 // PRINTDW(a): debug-only statement wrapper. With DBG defined it expands to
 // "; a;" so that the wrapped statement runs; without DBG it expands to a
 // lone ";" and the argument is dropped. The expansions are unchanged, but
 // each definition now sits on a single line: the earlier form ended in a
 // line-continuation backslash, which makes the macro body depend on what
 // the next physical line happens to be (a directly following #endif would
 // be absorbed into the macro).
 #ifdef DBG
    #define PRINTDW(a) ; a;
 #else
    #define PRINTDW(a) ;
 #endif
 // Where a wire sits relative to a mat. Not referenced by the functions
 // declared in this header; presumably consumed by the wire/H-tree models
 // (TODO confirm against callers).
 enum Wire_placement {
    outside_mat,
    inside_mat,
    local_wires
 };
 // Kind of H-tree network: address, data in/out, or search in/out.
 // Presumably matches the Htree2 instances a Bank holds (TODO confirm).
 enum Htree_type {
    Add_htree,
    Data_in_htree,
    Data_out_htree,
    Search_in_htree,
    Search_out_htree,
 };
 // Gate kind used by cmos_Isub_leakage() and cmos_Ig_leakage() to pick the
 // leakage formula: a bare nmos or pmos network, inverter, NAND, NOR,
 // tri (has separate enabled/disabled terms) or tg.
 enum Gate_type {
    nmos,
    pmos,
        inv,
    nand,
    nor,
    tri,
    tg
 };
 // Arrangement of the transistors in an nmos/pmos network with fanin > 1,
 // as consulted by cmos_Isub_leakage() and cmos_Ig_leakage().
 enum Half_net_topology {
    parallel,
    series
 };
 // log2(x) as a double; x must be > 0.
 double logtwo (double x);
 // In the five prototypes below the flag after _is_dram is named _is_cell to
 // match the definitions in basic_circuit.cc. It was previously spelled
 // _is_sram, which was misleading: combined with _is_dram = true it selects
 // the DRAM cell access transistor, and only with _is_dram = false does it
 // select the SRAM cell device.
 //
 // Gate capacitance for a transistor of the given width; wirelength is not
 // used by the current implementation.
 double gate_C(
    double width,
    double wirelength,
    bool _is_dram = false,
    bool _is_cell = false,
    bool _is_wl_tr = false);
 // Gate capacitance of a pass transistor; currently the same computation as
 // gate_C().
 double gate_C_pass(
    double width,
    double wirelength,
    bool   _is_dram = false,
    bool   _is_cell = false,
    bool   _is_wl_tr = false);
 // Drain capacitance of a possibly folded/stacked transistor. When
 // next_arg_thresh_folding_width_or_height_cell is 0, fold_dimension is the
 // folding width threshold; otherwise it is the height of the enclosing cell.
 double drain_C_(
    double width,
    int nchannel,
    int stack,
    int next_arg_thresh_folding_width_or_height_cell,
    double fold_dimension,
    bool _is_dram = false,
    bool _is_cell = false,
    bool _is_wl_tr = false);
 // On-resistance: stack * R_on / width, with R_on taken from the n-channel
 // value when nchannel is nonzero and the p-channel value otherwise.
 double tr_R_on(
    double width,
    int nchannel,
    int stack,
    bool _is_dram = false,
    bool _is_cell = false,
    bool _is_wl_tr = false);
 // Inverse of tr_R_on for a single device: width (um) that gives resistance res.
 double R_to_w(
    double res,
    int nchannel,
    bool _is_dram = false,
    bool _is_cell = false,
    bool _is_wl_tr = false);
 // Horowitz gate-delay approximation; rise is compared against RISE to
 // choose between the rising-input and falling-input forms.
 double horowitz (
    double inputramptime,
    double tf,
    double vs1,
    double vs2,
    int rise);
 // n_to_p_eff_curr_drv_ratio of the DRAM wordline device when both flags are
 // set, of the global peripheral device otherwise.
 double pmos_to_nmos_sz_ratio(
    bool _is_dram = false,
    bool _is_wl_tr = false);
 // For the leakage helpers below the device parameter set is: SRAM cell when
 // !_is_dram && _is_cell, DRAM wordline when _is_dram && _is_wl_tr, global
 // periphery otherwise.
 // nwidth * I_off_n
 double simplified_nmos_leakage(
    double nwidth,
    bool _is_dram = false,
    bool _is_cell = false,
    bool _is_wl_tr = false);
 // pwidth * I_off_p
 double simplified_pmos_leakage(
    double pwidth,
    bool _is_dram = false,
    bool _is_cell = false,
    bool _is_wl_tr = false);
 // nWidth * I_off_n + pWidth * I_off_p
 double cmos_Ileak(
    double nWidth,
    double pWidth,
    bool _is_dram = false,
    bool _is_cell = false,
    bool _is_wl_tr = false);
 // nWidth * I_g_on_n
 double cmos_Ig_n(
    double nWidth,
    bool _is_dram = false,
    bool _is_cell = false,
    bool _is_wl_tr= false);
 // pWidth * I_g_on_p
 double cmos_Ig_p(
    double pWidth,
    bool _is_dram = false,
    bool _is_cell = false,
    bool _is_wl_tr= false);
 // Subthreshold leakage of a gate of kind g_type with the given fanin
 // (>= 1); topo matters only for nmos/pmos networks with fanin > 1.
 double cmos_Isub_leakage(
    double nWidth,
    double pWidth,
    int    fanin,
    enum Gate_type g_type,
    bool _is_dram = false,
    bool _is_cell = false,
    bool _is_wl_tr = false,
    enum Half_net_topology topo = series);
 // Gate leakage of a gate of kind g_type with the given fanin (>= 1);
 // topo matters only for nmos/pmos networks with fanin > 1.
 double cmos_Ig_leakage(
    double nWidth,
    double pWidth,
    int    fanin,
    enum Gate_type g_type,
    bool _is_dram = false,
    bool _is_cell = false,
    bool _is_wl_tr = false,
    enum Half_net_topology topo = series);
 // Detailed short-circuit estimate.
 // NOTE(review): the definition in basic_circuit.cc currently returns 0 for
 // every input.
 double shortcircuit(
    double vt,
    double velocity_index,
    double c_in,
    double c_out,
    double w_nmos,
    double w_pmos,
    double i_on_n,
    double i_on_p,
    double i_on_n_in,
    double i_on_p_in,
    double vdd);
 // Simplified short-circuit estimate: mean of a charge and a discharge term.
 // w_nmos and w_pmos are not used by the definition.
 double shortcircuit_simple(
    double vt,
    double velocity_index,
    double c_in,
    double c_out,
    double w_nmos,
    double w_pmos,
    double i_on_n,
    double i_on_p,
    double i_on_n_in,
    double i_on_p_in,
    double vdd);
 // Fill a 4-entry power "point product" mask: pppv[0..3] = a, b, c, d.
 // (Strictly speaking this is not a real point product.) Each value defaults
 // to 1. pppv must point to at least four doubles.
 inline void set_pppm(
        double * pppv,
        double a=1,
    double b=1,
    double c=1,
    double d=1
    ){
  const double mask[4] = { a, b, c, d };
  for (int slot = 0; slot < 4; ++slot)
  {
    pppv[slot] = mask[slot];
  }
 }
 // Fill the first three entries of sppv: sppv[0..2] = a, b, c (each value
 // defaults to 1). The fourth argument d is accepted but never stored, so
 // sppv[3] is left untouched.
 // NOTE(review): confirm whether dropping d is intentional.
 inline void set_sppm(
        double * sppv,
        double a=1,
    double b=1,
    double c=1,
    double d=1
    ){
  double * out = sppv;
  *out++ = a;
  *out++ = b;
  *out   = c;
 }
 #endif
--- a/ext/mcpat/cacti/batch_tests
+++ b/ext/mcpat/cacti/batch_tests
@ -0,0 +1,41 @@
 # Batch driver: removes any previous out.csv, then runs ./cacti over a sweep
 # of sizes (first argument, 8192 .. 16777216 in powers of two) for three
 # settings of the four arguments that follow the "360" field, and finally
 # re-runs the four largest sizes with the 14th argument set to 1.
 rm -rf ./out.csv
 sizes="8192 16384 32768 65536 131072 262144 524288 1048576 2097152 4194304 8388608 16777216"
 big_sizes="2097152 4194304 8388608 16777216"
 # Argument runs shared by every invocation (word-split on purpose).
 head_args="64 1 1 0 0 0 1 65 512 0 0 0"
 mid_args="0 1 0 0 0 0 10 1000 1000 1000 1000 360"
 tail_args="1 1 1 1 0 0 0 1 1"
 for variant in "0 0 0 0" "3 0 0 0" "4 1 1 1"; do
   for size in $sizes; do
     ./cacti $size $head_args 0 $mid_args $variant $tail_args
   done
 done
 for size in $big_sizes; do
   ./cacti $size $head_args 1 $mid_args 4 1 1 1 $tail_args
 done
--- a/ext/mcpat/cacti/cache.cfg
+++ b/ext/mcpat/cacti/cache.cfg
@ -0,0 +1,175 @@
 # Cache size
 //-size (bytes) 2048
 //-size (bytes) 4096
 //-size (bytes) 32768
 //-size (bytes) 262144
 //-size (bytes) 1048576
 //-size (bytes) 2097152
 //-size (bytes) 4194304
 //-size (bytes) 8388608
 //-size (bytes) 16777216
 //-size (bytes) 33554432
 //-size (bytes) 134217728
 //-size (bytes) 67108864
 -size (bytes) 1073741824
 # Line size
 //-block size (bytes) 8
 -block size (bytes) 64
 # To model Fully Associative cache, set associativity to zero
 //-associativity 0
 //-associativity 2
 //-associativity 4
 -associativity 8
 //-associativity 16
 -read-write port 1
 -exclusive read port 0
 -exclusive write port 0
 -single ended read ports 0
 # Multiple banks connected using a bus
 -UCA bank count 1
 -technology (u) 0.022
 //-technology (u) 0.040
 //-technology (u) 0.032
 //-technology (u) 0.090
 # following three parameters are meaningful only for main memories
 -page size (bits) 8192 
 -burst length 8
 -internal prefetch width 8
 # following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
 -Data array cell type - "itrs-hp"
 //-Data array cell type - "itrs-lstp"
 //-Data array cell type - "itrs-lop"
 # following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
 -Data array peripheral type - "itrs-hp"
 //-Data array peripheral type - "itrs-lstp"
 //-Data array peripheral type - "itrs-lop"
 # following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
 -Tag array cell type - "itrs-hp"
 //-Tag array cell type - "itrs-lstp"
 //-Tag array cell type - "itrs-lop"
 # following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
 -Tag array peripheral type - "itrs-hp"
 //-Tag array peripheral type - "itrs-lstp"
 //-Tag array peripheral type - "itrs-lop"
 # Bus width include data bits and address bits required by the decoder
 //-output/input bus width 16
 -output/input bus width 512
 // 300-400 in steps of 10
 -operating temperature (K) 360
 # Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file) 
 # or main memory (no tag array and every access will happen at a page granularity Ref: CACTI 5.3 report)
 -cache type "cache"
 //-cache type "ram"
 //-cache type "main memory"
 # to model special structure like branch target buffers, directory, etc. 
 # change the tag size parameter
 # if you want cacti to calculate the tagbits, set the tag size to "default"
 -tag size (b) "default"
 //-tag size (b) 22
 # fast - data and tag access happen in parallel
 # sequential - data array is accessed after accessing the tag array
 # normal - data array lookup and tag access happen in parallel;
 #          the final data block is broadcast in the data array h-tree
 #          after getting the signal from the tag array
 //-access mode (normal, sequential, fast) - "fast"
 -access mode (normal, sequential, fast) - "normal"
 //-access mode (normal, sequential, fast) - "sequential"
 # DESIGN OBJECTIVE for UCA (or banks in NUCA)
 -design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:100:0
 # Percentage deviation from the minimum value 
 # Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization
 # that compromises at most 10% delay. 
 # NOTE: Try reasonable values for % deviation. Inconsistent deviation
 # percentage values will not produce any valid organizations. For example,
 # 0:0:100:100:100 will try to identify an organization that has both the
 # least delay and the least dynamic power. Since such an organization is not
 # possible, CACTI will throw an error. Refer to the CACTI-6 technical report
 # for more details.
 -deviate (delay, dynamic power, leakage power, cycle time, area) 20:100000:100000:100000:100000
 # Objective for NUCA
 -NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100
 -NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
 # Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for
 # energy-delay or energy-delay sq. product
 # Note: Optimize tag will disable weight or deviate values mentioned above
 # Set it to NONE to let weight and deviate values determine the 
 # appropriate cache configuration
 //-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"
 -Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"
 //-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
 -Cache model (NUCA, UCA)  - "UCA"
 //-Cache model (NUCA, UCA)  - "NUCA"
 # In order for CACTI to find the optimal NUCA bank value the following
 # variable should be assigned 0.
 -NUCA bank count 0
 # NOTE: for NUCA, the network frequency is set to a default value of
 # 5GHz in time.c. CACTI automatically calculates the maximum possible
 # frequency and downgrades this value if necessary.
 # By default CACTI considers both full-swing and low-swing 
 # wires to find an optimal configuration. However, it is possible to 
 # restrict the search space by changing the signalling from "default" to 
 # "fullswing" or "lowswing" type.
 //-Wire signalling (fullswing, lowswing, default) - "Global_10"
 -Wire signalling (fullswing, lowswing, default) - "default"
 //-Wire signalling (fullswing, lowswing, default) - "lowswing"
 //-Wire inside mat - "global"
 -Wire inside mat - "semi-global"
 //-Wire outside mat - "global"
 -Wire outside mat - "semi-global"
 //-Interconnect projection - "conservative"
 -Interconnect projection - "aggressive"
 # Contention in the network (which is a function of core count and cache level) is one of
 # the critical factors used for deciding the optimal bank count value.
 # Core count can be 4, 8, or 16
 //-Core count 4
 -Core count 8
 //-Core count 16
 -Cache level (L2/L3) - "L3"
 -Add ECC - "true"
 //-Print level (DETAILED, CONCISE) - "CONCISE"
 -Print level (DETAILED, CONCISE) - "DETAILED"
 # for debugging
 //-Print input parameters - "true"
 -Print input parameters - "false"
 # force CACTI to model the cache with the 
 # following Ndbl, Ndwl, Nspd, Ndsam,
 # and Ndcm values
 //-Force cache config - "true"
 -Force cache config - "false"
 -Ndwl 1
 -Ndbl 1
 -Nspd 0
 -Ndcm 1
 -Ndsam1 0
 -Ndsam2 0
--- a/ext/mcpat/cacti/cacti.i
+++ b/ext/mcpat/cacti/cacti.i
@ -0,0 +1,8 @@
 /* SWIG interface file: declares the module "cacti" and wraps everything
  * declared in cacti_interface.h. */
 %module cacti
 %{
 /* Includes the header in the wrapper code */
 #include "cacti_interface.h"
 %}
 /* Parse the header file to generate wrappers */
 %include "cacti_interface.h"
--- a/ext/mcpat/cacti/cacti.mk
+++ b/ext/mcpat/cacti/cacti.mk
@ -0,0 +1,51 @@
 # Build rules for the stand-alone cacti binary. Objects and the linked
 # binary are placed under obj_$(TAG)/; TAG is expected to be supplied by
 # the caller (TAG=dbg selects the debug flags below).
 TARGET = cacti
 SHELL = /bin/sh
 .PHONY: all depend clean
 .SUFFIXES: .cc .o
 # NTHREADS is passed to the compiler as -DNTHREADS; defaults to 8 when unset.
 ifndef NTHREADS
  NTHREADS = 8
 endif
 LIBS = 
 # NOTE(review): despite its name, INCS carries the math-library link flag;
 # it is only used on the link line below.
 INCS = -lm
 # Debug build: warnings on, no optimisation, NTHREADS forced to 1.
 # Any other TAG: -O3 with SSE2 floating point and the configured NTHREADS.
 ifeq ($(TAG),dbg)
  DBG = -Wall 
  OPT = -ggdb -g -O0 -DNTHREADS=1  -gstabs+
 else
  DBG = 
  OPT = -O3 -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS)
 endif
 #CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT) 
 CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT) 
 # Both compilers are forced to produce 32-bit code.
 CXX = g++ -m32
 CC  = gcc -m32
 SRCS  = area.cc bank.cc mat.cc main.cc Ucache.cc io.cc technology.cc basic_circuit.cc parameter.cc \
 		decoder.cc component.cc uca.cc subarray.cc wire.cc htree2.cc \
 		cacti_interface.cc router.cc nuca.cc crossbar.cc arbiter.cc 
 OBJS = $(patsubst %.cc,obj_$(TAG)/%.o,$(SRCS))
 # NOTE(review): PYTHONLIB_SRCS, PYTHONLIB_OBJS and INCLUDES are defined here
 # but not referenced by any rule in this file.
 PYTHONLIB_SRCS = $(patsubst main.cc, ,$(SRCS)) obj_$(TAG)/cacti_wrap.cc
 PYTHONLIB_OBJS = $(patsubst %.cc,%.o,$(PYTHONLIB_SRCS)) 
 INCLUDES       = -I /usr/include/python2.4 -I /usr/lib/python2.4/config
 # Default goal: link the binary under obj_$(TAG)/ and copy it to ./$(TARGET).
 all: obj_$(TAG)/$(TARGET)
 	cp -f obj_$(TAG)/$(TARGET) $(TARGET)
 obj_$(TAG)/$(TARGET) : $(OBJS)
 	$(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread
 #obj_$(TAG)/%.o : %.cc
 #	$(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $<
 obj_$(TAG)/%.o : %.cc
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 # NOTE(review): this removes *.o from the current directory only; the objects
 # produced by the rules above live in obj_$(TAG)/ and are not removed here.
 clean:
 	-rm -f *.o _cacti.so cacti.py $(TARGET)
--- a/ext/mcpat/cacti/cacti_interface.cc
+++ b/ext/mcpat/cacti/cacti_interface.cc
@ -0,0 +1,173 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include <pthread.h>
 #include <algorithm>
 #include <cmath>
 #include <ctime>
 #include <iostream>
 #include "Ucache.h"
 #include "area.h"
 #include "basic_circuit.h"
 #include "cacti_interface.h"
 #include "component.h"
 #include "const.h"
 #include "parameter.h"
 using namespace std;
 // "Less than" ordering between two array organizations. The fields are
 // compared in priority order Nspd, Ndwl, Ndbl, deg_bl_muxing, Ndsam_lev_1,
 // Ndsam_lev_2; the first field that differs decides the result. Returns
 // false when no field of m1 compares below the matching field of m2.
 bool mem_array::lt(const mem_array * m1, const mem_array * m2)
 {
   if (m1->Nspd < m2->Nspd) { return true; }
   if (m1->Nspd > m2->Nspd) { return false; }
   if (m1->Ndwl < m2->Ndwl) { return true; }
   if (m1->Ndwl > m2->Ndwl) { return false; }
   if (m1->Ndbl < m2->Ndbl) { return true; }
   if (m1->Ndbl > m2->Ndbl) { return false; }
   if (m1->deg_bl_muxing < m2->deg_bl_muxing) { return true; }
   if (m1->deg_bl_muxing > m2->deg_bl_muxing) { return false; }
   if (m1->Ndsam_lev_1 < m2->Ndsam_lev_1) { return true; }
   if (m1->Ndsam_lev_1 > m2->Ndsam_lev_1) { return false; }
   // Last tie-breaker: equal values fall through to false.
   return m1->Ndsam_lev_2 < m2->Ndsam_lev_2;
 }
 void uca_org_t::find_delay()
 {
  mem_array * data_arr = data_array2;
  mem_array * tag_arr  = tag_array2;
  // check whether it is a regular cache or scratch ram
  if (g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)
  {
    access_time = data_arr->access_time;
  }
  // Both tag and data lookup happen in parallel
  // and the entire set is sent over the data array h-tree without
  // waiting for the way-select signal --TODO add the corresponding
  // power overhead Nav
  else if (g_ip->fast_access == true)
  {
    access_time = MAX(tag_arr->access_time, data_arr->access_time);
  }
  // Tag is accessed first. On a hit, way-select signal along with the
  // address is sent to read/write the appropriate block in the data
  // array
  else if (g_ip->is_seq_acc == true)
  {
    access_time = tag_arr->access_time + data_arr->access_time;
  }
  // Normal access: tag array access and data array access happen in parallel.
  // But, the data array will wait for the way-select and transfer only the
  // appropriate block over the h-tree.
  else
  {
    access_time = MAX(tag_arr->access_time + data_arr->delay_senseamp_mux_decoder,
                      data_arr->delay_before_subarray_output_driver) +
                  data_arr->delay_from_subarray_output_driver_to_output;
  }
 }
 // Sets `power`: the data array power alone for RAM / CAM / fully-associative
 // structures, otherwise the sum of the data and tag array power.
 void uca_org_t::find_energy()
 {
   const bool data_only = g_ip->pure_ram || g_ip->pure_cam || g_ip->fully_assoc;
   if (data_only)
   {
     power = data_array2->power;
   }
   else
   {
     power = data_array2->power + tag_array2->power;
   }
 }
 // Sets cache_ht, cache_len and area. With a tag array present, the height is
 // the taller of the two arrays and the length is the sum of both widths;
 // otherwise the data array dimensions are used as they are.
 void uca_org_t::find_area()
 {
   const bool data_only = g_ip->pure_ram || g_ip->pure_cam || g_ip->fully_assoc;
   if (!data_only)
   {
     cache_ht  = MAX(tag_array2->height, data_array2->height);
     cache_len = tag_array2->width + data_array2->width;
   }
   else
   {
     cache_ht  = data_array2->height;
     cache_len = data_array2->width;
   }
   area = cache_ht * cache_len;
 }
 // For RAM / CAM / fully-associative structures whose data array area
 // efficiency is below 20%, divides cache_ht and cache_len by
 // sqrt(0.2 / (area_efficiency/100)), which is the same as scaling the area
 // by (area_efficiency/100)/0.2. `area` is always recomputed from cache_ht
 // and cache_len before returning.
 void uca_org_t::adjust_area()
 {
   const bool data_only = g_ip->pure_ram || g_ip->pure_cam || g_ip->fully_assoc;
   if (data_only && data_array2->area_efficiency/100.0<0.2)
   {
     const double area_adjust = sqrt(0.2/(data_array2->area_efficiency/100.0));
     cache_ht  /= area_adjust;
     cache_len /= area_adjust;
   }
   area = cache_ht * cache_len;
 }
 // Sets cycle_time: the data array cycle time for RAM / CAM /
 // fully-associative structures, otherwise the larger of the tag and data
 // array cycle times.
 void uca_org_t::find_cyc()
 {
   const bool data_only = g_ip->pure_ram || g_ip->pure_cam || g_ip->fully_assoc;
   if (!data_only)
   {
     cycle_time = MAX(tag_array2->cycle_time,
                      data_array2->cycle_time);
     return;
   }
   cycle_time = data_array2->cycle_time;
 }
 // Builds an empty organization: no tag/data array attached, every scalar
 // result zeroed, `valid` false, an empty file name and zeroed result
 // structs. Previously only the two pointers were initialised, so reading
 // any other member before it had been assigned used an indeterminate value.
 // Initialisers are listed in declaration order.
 uca_org_t :: uca_org_t()
 :tag_array2(0),
  data_array2(0),
  access_time(0),
  cycle_time(0),
  area(0),
  area_efficiency(0),
  power(),
  leak_power_with_sleep_transistors_in_mats(0),
  cache_ht(0),
  cache_len(0),
  vdd_periph_global(0),
  valid(false),
  tag_array(),
  data_array()
 {
   file_n[0] = '\0';
 }
 // Releases the data and tag arrays attached to this organization.
 // Both pointers are reset afterwards, so calling cleanup() again on the same
 // object is a harmless no-op instead of a double delete.
 // NOTE: copies of this object taken before cleanup() still hold the old
 // pointer values; cleanup() must be called on only one of them.
 void uca_org_t :: cleanup()
 {
   // delete on a null pointer does nothing, so no guard is needed.
   delete data_array2;
   data_array2 = 0;
   delete tag_array2;
   tag_array2 = 0;
 }
--- a/ext/mcpat/cacti/cacti_interface.h
+++ b/ext/mcpat/cacti/cacti_interface.h
@ -0,0 +1,633 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __CACTI_INTERFACE_H__
 #define __CACTI_INTERFACE_H__
 #include <iostream>
 #include <list>
 #include <map>
 #include <string>
 #include <vector>
 #include "const.h"
 using namespace std;
 class min_values_t;
 class mem_array;
 class uca_org_t;
 // Bundle of five power terms (dynamic, leakage, gate leakage, short circuit
 // and longer-channel leakage) that are carried around and combined together.
 class powerComponents
 {
  public:
    double dynamic;
    double leakage;
    double gate_leakage;
    double short_circuit;
    double longer_channel_leakage;
    // Every term starts at zero.
    powerComponents()
      : dynamic(0),
        leakage(0),
        gate_leakage(0),
        short_circuit(0),
        longer_channel_leakage(0)
    {
    }
    // Member-wise copy of the five terms.
    powerComponents(const powerComponents & obj)
      : dynamic(obj.dynamic),
        leakage(obj.leakage),
        gate_leakage(obj.gate_leakage),
        short_circuit(obj.short_circuit),
        longer_channel_leakage(obj.longer_channel_leakage)
    {
    }
    // Member-wise assignment; returns *this so assignments can be chained.
    powerComponents & operator=(const powerComponents & rhs)
    {
      dynamic                = rhs.dynamic;
      leakage                = rhs.leakage;
      gate_leakage           = rhs.gate_leakage;
      short_circuit          = rhs.short_circuit;
      longer_channel_leakage = rhs.longer_channel_leakage;
      return *this;
    }
    // Puts every term back to zero.
    void reset()
    {
      dynamic = leakage = gate_leakage = short_circuit = longer_channel_leakage = 0;
    }
    // Defined outside this class: term-wise sum, and scaling by an array of factors.
    friend powerComponents operator+(const powerComponents & x, const powerComponents & y);
    friend powerComponents operator*(const powerComponents & x, double const * const y);
 };
 // Groups three powerComponents records: one per read, write and search
 // operation.
 class powerDef
 {
  public:
    powerComponents readOp;
    powerComponents writeOp;
    powerComponents searchOp;//Sheng: for CAM and FA
    // Each record is default-constructed.
    powerDef()
    {
    }
    // Resets all three records.
    void reset()
    {
      readOp.reset();
      writeOp.reset();
      searchOp.reset();
    }
    // Defined outside this class: sum of two powerDefs, and scaling by an array of factors.
    friend powerDef operator+(const powerDef & x, const powerDef & y);
    friend powerDef operator*(const powerDef & x, double const * const y);
 };
 // Wire categories that can be selected for interconnect. Enumerators take
 // consecutive values starting at 0 (Global); Invalid_wtype is the last one.
 enum Wire_type
 {
    Global /* global wires with repeaters */,
    Global_5 /* 5% delay penalty */,
    Global_10 /* 10% delay penalty */,
    Global_20 /* 20% delay penalty */,
    Global_30 /* 30% delay penalty */,
    Low_swing /* differential low power wires with high area overhead */,
    Semi_global /* mid-level wires with repeaters*/,
    Transmission /* transmission lines with high area overhead */,
    Optical /* optical wires */,
    Invalid_wtype
 };
 // Collection of all input settings for one run: memory geometry, port
 // counts, technology selections, optimisation weights/deviations, NUCA
 // options and forced-organization values. All members are public and the
 // class declares no constructor, so fields hold whatever the caller or
 // parse_cfg() stores in them.
 class InputParameter
 {
  public:
    // Presumably fills the fields below from the configuration file `infile`
    // -- definition not visible here.
    void parse_cfg(const string & infile);
    bool error_checking();  // return false if the input parameters are problematic
    void display_ip();
    // --- basic geometry ---
    unsigned int cache_sz;  // in bytes
    unsigned int line_sz;
    unsigned int assoc;
    unsigned int nbanks;
    unsigned int out_w;// == nr_bits_out
    bool     specific_tag;
    unsigned int tag_w;
    unsigned int access_mode;
    unsigned int obj_func_dyn_energy;
    unsigned int obj_func_dyn_power;
    unsigned int obj_func_leak_power;
    unsigned int obj_func_cycle_t;
    double   F_sz_nm;          // feature size in nm
    double   F_sz_um;          // feature size in um
    // --- port counts ---
    unsigned int num_rw_ports;
    unsigned int num_rd_ports;
    unsigned int num_wr_ports;
    unsigned int num_se_rd_ports;  // number of single ended read ports
    unsigned int num_search_ports;  // Sheng: number of search ports for CAM
    // --- kind of structure being modelled ---
    bool     is_main_mem;
    bool     is_cache;
    bool     pure_ram;
    bool     pure_cam;
    bool     rpters_in_htree;  // if there are repeaters in htree segment
    unsigned int ver_htree_wires_over_array;
    unsigned int broadcast_addr_din_over_ver_htrees;
    unsigned int temp;
    // --- technology selections for cells and peripheral/global circuitry ---
    unsigned int ram_cell_tech_type;
    unsigned int peri_global_tech_type;
    unsigned int data_arr_ram_cell_tech_type;
    unsigned int data_arr_peri_global_tech_type;
    unsigned int tag_arr_ram_cell_tech_type;
    unsigned int tag_arr_peri_global_tech_type;
    unsigned int burst_len;
    unsigned int int_prefetch_w;
    unsigned int page_sz_bits;
    unsigned int ic_proj_type;      // interconnect_projection_type
    unsigned int wire_is_mat_type;  // wire_inside_mat_type
    unsigned int wire_os_mat_type; // wire_outside_mat_type
    enum Wire_type wt;
    int force_wiretype;
    bool print_input_args;
    unsigned int nuca_cache_sz; // TODO
    // --- forced organization values (see force_cache_config) ---
    int ndbl, ndwl, nspd, ndsam1, ndsam2, ndcm;
    bool force_cache_config;
    int cache_level;
    int cores;
    int nuca_bank_count;
    int force_nuca_bank;
    // --- optimisation weights and allowed deviations, UCA and NUCA variants ---
    int delay_wt, dynamic_power_wt, leakage_power_wt,
        cycle_time_wt, area_wt;
    int delay_wt_nuca, dynamic_power_wt_nuca, leakage_power_wt_nuca,
        cycle_time_wt_nuca, area_wt_nuca;
    int delay_dev, dynamic_power_dev, leakage_power_dev,
        cycle_time_dev, area_dev;
    int delay_dev_nuca, dynamic_power_dev_nuca, leakage_power_dev_nuca,
        cycle_time_dev_nuca, area_dev_nuca;
    int ed; //ED or ED2 optimization
    int nuca;
    bool     fast_access;
    unsigned int block_sz;  // bytes
    unsigned int tag_assoc;
    unsigned int data_assoc;
    bool     is_seq_acc;
    bool     fully_assoc;
    unsigned int nsets;  // == number_of_sets
    int print_detail;
    bool     add_ecc_b_;
  //parameters for design constraint
  double throughput;
  double latency;
  bool pipelinable;
  int pipeline_stages;
  int per_stage_vector;
  bool with_clock_grid;
 };
 // Flat record of the results reported for one memory array: the chosen
 // organization, individual delay terms, per-component power, geometry and
 // DRAM-related figures. uca_org_t embeds one of these for the tag array and
 // one for the data array. Units are not stated in this header.
 typedef struct{
  // --- organization ---
  int Ndwl;
  int Ndbl;
  double Nspd;
  int deg_bl_muxing;
  int Ndsam_lev_1;
  int Ndsam_lev_2;
  int number_activated_mats_horizontal_direction;
  int number_subbanks;
  int page_size_in_bits;
  // --- delay terms ---
  double delay_route_to_bank;
  double delay_crossbar;
  double delay_addr_din_horizontal_htree;
  double delay_addr_din_vertical_htree;
  double delay_row_predecode_driver_and_block;
  double delay_row_decoder;
  double delay_bitlines;
  double delay_sense_amp;
  double delay_subarray_output_driver;
  double delay_bit_mux_predecode_driver_and_block;
  double delay_bit_mux_decoder;
  double delay_senseamp_mux_lev_1_predecode_driver_and_block;
  double delay_senseamp_mux_lev_1_decoder;
  double delay_senseamp_mux_lev_2_predecode_driver_and_block;
  double delay_senseamp_mux_lev_2_decoder;
  double delay_input_htree;
  double delay_output_htree;
  double delay_dout_vertical_htree;
  double delay_dout_horizontal_htree;
  double delay_comparator;
  double access_time;
  double cycle_time;
  double multisubbank_interleave_cycle_time;
  double delay_request_network;
  double delay_inside_mat;
  double delay_reply_network;
  double trcd;
  double cas_latency;
  double precharge_delay;
  // --- per-component power ---
  powerDef power_routing_to_bank;
  powerDef power_addr_input_htree;
  powerDef power_data_input_htree;
  powerDef power_data_output_htree;
  powerDef power_addr_horizontal_htree;
  powerDef power_datain_horizontal_htree;
  powerDef power_dataout_horizontal_htree;
  powerDef power_addr_vertical_htree;
  powerDef power_datain_vertical_htree;
  powerDef power_row_predecoder_drivers;
  powerDef power_row_predecoder_blocks;
  powerDef power_row_decoders;
  powerDef power_bit_mux_predecoder_drivers;
  powerDef power_bit_mux_predecoder_blocks;
  powerDef power_bit_mux_decoders;
  powerDef power_senseamp_mux_lev_1_predecoder_drivers;
  powerDef power_senseamp_mux_lev_1_predecoder_blocks;
  powerDef power_senseamp_mux_lev_1_decoders;
  powerDef power_senseamp_mux_lev_2_predecoder_drivers;
  powerDef power_senseamp_mux_lev_2_predecoder_blocks;
  powerDef power_senseamp_mux_lev_2_decoders;
  powerDef power_bitlines;
  powerDef power_sense_amps;
  powerDef power_prechg_eq_drivers;
  powerDef power_output_drivers_at_subarray;
  powerDef power_dataout_vertical_htree;
  powerDef power_comparators;
  powerDef power_crossbar;
  powerDef total_power;
  // --- geometry ---
  double area;
  double all_banks_height;
  double all_banks_width;
  double bank_height;
  double bank_width;
  double subarray_memory_cell_area_height;
  double subarray_memory_cell_area_width;
  double mat_height;
  double mat_width;
  double routing_area_height_within_bank;
  double routing_area_width_within_bank;
  double area_efficiency;
 // The percentage-breakdown fields below are disabled (commented out).
 //  double perc_power_dyn_routing_to_bank;
 //  double perc_power_dyn_addr_horizontal_htree;
 //  double perc_power_dyn_datain_horizontal_htree;
 //  double perc_power_dyn_dataout_horizontal_htree;
 //  double perc_power_dyn_addr_vertical_htree;
 //  double perc_power_dyn_datain_vertical_htree;
 //  double perc_power_dyn_row_predecoder_drivers;
 //  double perc_power_dyn_row_predecoder_blocks;
 //  double perc_power_dyn_row_decoders;
 //  double perc_power_dyn_bit_mux_predecoder_drivers;
 //  double perc_power_dyn_bit_mux_predecoder_blocks;
 //  double perc_power_dyn_bit_mux_decoders;
 //  double perc_power_dyn_senseamp_mux_lev_1_predecoder_drivers;
 //  double perc_power_dyn_senseamp_mux_lev_1_predecoder_blocks;
 //  double perc_power_dyn_senseamp_mux_lev_1_decoders;
 //  double perc_power_dyn_senseamp_mux_lev_2_predecoder_drivers;
 //  double perc_power_dyn_senseamp_mux_lev_2_predecoder_blocks;
 //  double perc_power_dyn_senseamp_mux_lev_2_decoders;
 //  double perc_power_dyn_bitlines;
 //  double perc_power_dyn_sense_amps;
 //  double perc_power_dyn_prechg_eq_drivers;
 //  double perc_power_dyn_subarray_output_drivers;
 //  double perc_power_dyn_dataout_vertical_htree;
 //  double perc_power_dyn_comparators;
 //  double perc_power_dyn_crossbar;
 //  double perc_power_dyn_spent_outside_mats;
 //  double perc_power_leak_routing_to_bank;
 //  double perc_power_leak_addr_horizontal_htree;
 //  double perc_power_leak_datain_horizontal_htree;
 //  double perc_power_leak_dataout_horizontal_htree;
 //  double perc_power_leak_addr_vertical_htree;
 //  double perc_power_leak_datain_vertical_htree;
 //  double perc_power_leak_row_predecoder_drivers;
 //  double perc_power_leak_row_predecoder_blocks;
 //  double perc_power_leak_row_decoders;
 //  double perc_power_leak_bit_mux_predecoder_drivers;
 //  double perc_power_leak_bit_mux_predecoder_blocks;
 //  double perc_power_leak_bit_mux_decoders;
 //  double perc_power_leak_senseamp_mux_lev_1_predecoder_drivers;
 //  double perc_power_leak_senseamp_mux_lev_1_predecoder_blocks;
 //  double perc_power_leak_senseamp_mux_lev_1_decoders;
 //  double perc_power_leak_senseamp_mux_lev_2_predecoder_drivers;
 //  double perc_power_leak_senseamp_mux_lev_2_predecoder_blocks;
 //  double perc_power_leak_senseamp_mux_lev_2_decoders;
 //  double perc_power_leak_bitlines;
 //  double perc_power_leak_sense_amps;
 //  double perc_power_leak_prechg_eq_drivers;
 //  double perc_power_leak_subarray_output_drivers;
 //  double perc_power_leak_dataout_vertical_htree;
 //  double perc_power_leak_comparators;
 //  double perc_power_leak_crossbar;
 //  double perc_leak_mats;
 //  double perc_active_mats;
  // --- DRAM-related figures ---
  double refresh_power;
  double dram_refresh_period;
  double dram_array_availability;
  double dyn_read_energy_from_closed_page;
  double dyn_read_energy_from_open_page;
  double leak_power_subbank_closed_page;
  double leak_power_subbank_open_page;
  double leak_power_request_and_reply_networks;
  double activate_energy;
  double read_energy;
  double write_energy;
  double precharge_energy;
 } results_mem_array;
 // Result of evaluating one cache/memory organization: pointers to the tag
 // and data array models plus the summary figures derived from them.
 // The destructor is empty, so tag_array2 and data_array2 are NOT released
 // automatically; cleanup() is the member that deletes them. The class has
 // no user-defined copy operations, so copies share the same two pointers.
 class uca_org_t
 {
  public:
    mem_array * tag_array2;
    mem_array * data_array2;
    double access_time;
    double cycle_time;
    double area;
    double area_efficiency;
    powerDef power;
    double leak_power_with_sleep_transistors_in_mats;
    double cache_ht;
    double cache_len;
    char file_n[100];
    double vdd_periph_global;
    bool valid;
    results_mem_array tag_array;
    results_mem_array data_array;
    uca_org_t();
    // The find_*() members derive the summary fields above from tag_array2
    // and data_array2 (definitions are in the implementation file).
    void find_delay();
    void find_energy();
    void find_area();
    void find_cyc();
    void adjust_area();//for McPAT only to adjust routing overhead
    // Deletes data_array2 and tag_array2.
    void cleanup();
    ~uca_org_t(){};
 };
 // Entry points. Only the declarations are visible here; behavioural notes
 // below are limited to what the signatures show.
 void reconfigure(InputParameter *local_interface, uca_org_t *fin_res);
 // Overload taking a file name; presumably a configuration file -- TODO confirm.
 uca_org_t cacti_interface(const string & infile_name);
 //McPAT's plain interface, please keep !!!
 uca_org_t cacti_interface(InputParameter * const local_interface);
 //McPAT's plain interface, please keep !!!
 uca_org_t init_interface(InputParameter * const local_interface);
 //McPAT's plain interface, please keep !!!
 // Overload taking every setting as an individual positional argument.
 // NOTE(review): the obj_func_* parameters end with (cycle_time, area) while
 // the dev_func_* parameters end with (area, cycle_time); callers passing
 // positional values must follow the order declared here.
 uca_org_t cacti_interface(
            int cache_size,
            int line_size,
            int associativity,
            int rw_ports,
            int excl_read_ports,
            int excl_write_ports,
            int single_ended_read_ports,
            int search_ports,
            int banks,
            double tech_node,
            int output_width,
            int specific_tag,
            int tag_width,
            int access_mode,
            int cache,
            int main_mem,
            int obj_func_delay,
            int obj_func_dynamic_power,
            int obj_func_leakage_power,
            int obj_func_cycle_time,
            int obj_func_area,
            int dev_func_delay,
            int dev_func_dynamic_power,
            int dev_func_leakage_power,
            int dev_func_area,
            int dev_func_cycle_time,
            int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
            int temp,
            int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
            int data_arr_ram_cell_tech_flavor_in,
            int data_arr_peri_global_tech_flavor_in,
            int tag_arr_ram_cell_tech_flavor_in,
            int tag_arr_peri_global_tech_flavor_in,
            int interconnect_projection_type_in,
            int wire_inside_mat_type_in,
            int wire_outside_mat_type_in,
            int REPEATERS_IN_HTREE_SEGMENTS_in,
            int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in,
            int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in,
            int PAGE_SIZE_BITS_in,
            int BURST_LENGTH_in,
            int INTERNAL_PREFETCH_WIDTH_in,
            int force_wiretype,
            int wiretype,
            int force_config,
            int ndwl,
            int ndbl,
            int nspd,
            int ndcm,
            int ndsam1,
            int ndsam2,
            int ecc);
 //    int cache_size,
 //    int line_size,
 //    int associativity,
 //    int rw_ports,
 //    int excl_read_ports,
 //    int excl_write_ports,
 //    int single_ended_read_ports,
 //    int banks,
 //    double tech_node,
 //    int output_width,
 //    int specific_tag,
 //    int tag_width,
 //    int access_mode,
 //    int cache,
 //    int main_mem,
 //    int obj_func_delay,
 //    int obj_func_dynamic_power,
 //    int obj_func_leakage_power,
 //    int obj_func_area,
 //    int obj_func_cycle_time,
 //    int dev_func_delay,
 //    int dev_func_dynamic_power,
 //    int dev_func_leakage_power,
 //    int dev_func_area,
 //    int dev_func_cycle_time,
 //    int temp,
 //    int data_arr_ram_cell_tech_flavor_in,
 //    int data_arr_peri_global_tech_flavor_in,
 //    int tag_arr_ram_cell_tech_flavor_in,
 //    int tag_arr_peri_global_tech_flavor_in,
 //    int interconnect_projection_type_in,
 //    int wire_inside_mat_type_in,
 //    int wire_outside_mat_type_in,
 //    int REPEATERS_IN_HTREE_SEGMENTS_in,
 //    int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in,
 //    int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in,
 ////    double MAXAREACONSTRAINT_PERC_in,
 ////    double MAXACCTIMECONSTRAINT_PERC_in,
 ////    double MAX_PERC_DIFF_IN_DELAY_FROM_BEST_DELAY_REPEATER_SOLUTION_in,
 //    int PAGE_SIZE_BITS_in,
 //    int BURST_LENGTH_in,
 //    int INTERNAL_PREFETCH_WIDTH_in);
 //Naveen's interface
 // Flat overload that additionally carries the NUCA configuration (is_nuca,
 // core_count, cache_level, nuca_bank_count and a separate set of nuca_*
 // objective/deviation weights).
 //
 // CAUTION: nearly every parameter is a plain int, so misplaced arguments are
 // not diagnosed. Here the objective weights are ordered
 // (..., obj_func_area, obj_func_cycle_time); the McPAT flat overload declared
 // above uses (..., obj_func_cycle_time, obj_func_area). The page size, burst
 // length and prefetch width also sit much earlier in this list (page_sz,
 // burst_length, pre_width) than their counterparts in that overload.
 uca_org_t cacti_interface(
    int cache_size,
    int line_size,
    int associativity,
    int rw_ports,
    int excl_read_ports,
    int excl_write_ports,
    int single_ended_read_ports,
    int banks,
    double tech_node,
    int page_sz,
    int burst_length,
    int pre_width,
    int output_width,
    int specific_tag,
    int tag_width,
    int access_mode, //0 normal, 1 seq, 2 fast
    int cache, //scratch ram or cache
    int main_mem,
    int obj_func_delay,
    int obj_func_dynamic_power,
    int obj_func_leakage_power,
    int obj_func_area,
    int obj_func_cycle_time,
    int dev_func_delay,
    int dev_func_dynamic_power,
    int dev_func_leakage_power,
    int dev_func_area,
    int dev_func_cycle_time,
    int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
    int temp,
    int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
    int data_arr_ram_cell_tech_flavor_in,
    int data_arr_peri_global_tech_flavor_in,
    int tag_arr_ram_cell_tech_flavor_in,
    int tag_arr_peri_global_tech_flavor_in,
    int interconnect_projection_type_in, // 0 - aggressive, 1 - normal
    int wire_inside_mat_type_in,
    int wire_outside_mat_type_in,
    int is_nuca, // 0 - UCA, 1 - NUCA
    int core_count,
    int cache_level, // 0 - L2, 1 - L3
    int nuca_bank_count,
    int nuca_obj_func_delay,
    int nuca_obj_func_dynamic_power,
    int nuca_obj_func_leakage_power,
    int nuca_obj_func_area,
    int nuca_obj_func_cycle_time,
    int nuca_dev_func_delay,
    int nuca_dev_func_dynamic_power,
    int nuca_dev_func_leakage_power,
    int nuca_dev_func_area,
    int nuca_dev_func_cycle_time,
    int REPEATERS_IN_HTREE_SEGMENTS_in,//TODO for now only wires with repeaters are supported
    int p_input);
 // Detailed result record for a single memory array (uca_org_t keeps one for
 // the tag array and one for the data array): the partitioning chosen, overall
 // timing/area/power, and a per-stage breakdown of delay and power.
 class mem_array
 {
  public:
  // Array partitioning parameters. Note that Nspd is a double while the
  // other partitioning values are ints.
  int    Ndcm;
  int    Ndwl;
  int    Ndbl;
  double Nspd;
  int    deg_bl_muxing;
  int    Ndsam_lev_1;
  int    Ndsam_lev_2;
  // Aggregate timing, area and power.
  double access_time;
  double cycle_time;
  double multisubbank_interleave_cycle_time;
  double area_ram_cells;
  double area;
  powerDef power;
  double delay_senseamp_mux_decoder;
  double delay_before_subarray_output_driver;
  double delay_from_subarray_output_driver_to_output;
  // Physical dimensions of the array, its mats and its subarrays.
  double height;
  double width;
  double mat_height;
  double mat_length;
  double subarray_length;
  double subarray_height;
  // Per-stage delay breakdown.
  double delay_route_to_bank,
         delay_input_htree,
         delay_row_predecode_driver_and_block,
         delay_row_decoder,
         delay_bitlines,
         delay_sense_amp,
         delay_subarray_output_driver,
         delay_dout_htree,
         delay_comparator,
         delay_matchlines;
  double all_banks_height,
         all_banks_width,
         area_efficiency;
  // Per-stage power breakdown.
  powerDef power_routing_to_bank;
  powerDef power_addr_input_htree;
  powerDef power_data_input_htree;
  powerDef power_data_output_htree;
  powerDef power_htree_in_search;
  powerDef power_htree_out_search;
  powerDef power_row_predecoder_drivers;
  powerDef power_row_predecoder_blocks;
  powerDef power_row_decoders;
  powerDef power_bit_mux_predecoder_drivers;
  powerDef power_bit_mux_predecoder_blocks;
  powerDef power_bit_mux_decoders;
  powerDef power_senseamp_mux_lev_1_predecoder_drivers;
  powerDef power_senseamp_mux_lev_1_predecoder_blocks;
  powerDef power_senseamp_mux_lev_1_decoders;
  powerDef power_senseamp_mux_lev_2_predecoder_drivers;
  powerDef power_senseamp_mux_lev_2_predecoder_blocks;
  powerDef power_senseamp_mux_lev_2_decoders;
  powerDef power_bitlines;
  powerDef power_sense_amps;
  powerDef power_prechg_eq_drivers;
  powerDef power_output_drivers_at_subarray;
  powerDef power_dataout_vertical_htree;
  powerDef power_comparators;
  powerDef power_cam_bitline_precharge_eq_drv;
  powerDef power_searchline;
  powerDef power_searchline_precharge;
  powerDef power_matchlines;
  powerDef power_matchline_precharge;
  powerDef power_matchline_to_wordline_drv;
  // Raw pointer to a min_values_t record; ownership is not expressed in this
  // header. NOTE(review): confirm who allocates and frees it.
  min_values_t *arr_min;
  enum Wire_type wt;
  // dram stats
  double activate_energy, read_energy, write_energy, precharge_energy,
  refresh_power, leak_power_subbank_closed_page, leak_power_subbank_open_page,
  leak_power_request_and_reply_networks;
  double precharge_delay;
  // Static two-argument predicate over mem_array pointers; judging by the
  // name it is a "less than" ordering, but the definition is not in this
  // header - confirm the comparison key before relying on it.
  static bool lt(const mem_array * m1, const mem_array * m2);
 };
 #endif
--- a/ext/mcpat/cacti/component.cc
+++ b/ext/mcpat/cacti/component.cc
@ -0,0 +1,236 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include <cassert>
 #include <cmath>
 #include <iostream>
 #include "bank.h"
 #include "component.h"
 #include "decoder.h"
 using namespace std;
 // Builds a component with value-initialized area and power records and with
 // both timing fields set to zero. cycle_time is initialized explicitly so
 // that reading it before a derived class assigns it yields a defined value
 // rather than an indeterminate one. Initializers follow the declaration
 // order of the members in component.h.
 Component::Component()
  :area(), power(), rt_power(), delay(0), cycle_time(0)
 {
 }
 // Nothing to release: the data members declared in component.h (Area,
 // powerDef and doubles) are all held by value.
 Component::~Component()
 {
 }
 // Returns the total diffusion width of a transistor stack that is
 // num_stacked_in devices deep and folded into num_folded_tr legs.
 // Reads the feature size from g_ip and the layout spacings from g_tp.
 double Component::compute_diffusion_width(int num_stacked_in, int num_folded_tr)
 {
  const double poly_w        = g_ip->F_sz_um;
  const double contact_pitch = g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact;

  // Unfolded stack: a contacted region for both source and drain, the
  // stacked poly gates, and the poly-to-poly gaps between them.
  double diff_w = 2 * contact_pitch +
                  num_stacked_in * poly_w +
                  (num_stacked_in - 1) * g_tp.spacing_poly_to_poly;

  if (num_folded_tr <= 1)
  {
    return diff_w;
  }

  // Contribution of the additional folded legs.
  diff_w += (num_folded_tr - 2) * 2 * contact_pitch +
            (num_folded_tr - 1) * num_stacked_in * poly_w +
            (num_folded_tr - 1) * (num_stacked_in - 1) * g_tp.spacing_poly_to_poly;
  return diff_w;
 }
 double Component::compute_gate_area(
    int gate_type,
    int num_inputs,
    double w_pmos,
    double w_nmos,
    double h_gate)
 {
  if (w_pmos <= 0.0 || w_nmos <= 0.0)
  {
    return 0.0;
  }
  double w_folded_pmos, w_folded_nmos;
  int    num_folded_pmos, num_folded_nmos;
  double total_ndiff_w, total_pdiff_w;
  Area gate;
  double h_tr_region  = h_gate - 2 * g_tp.HPOWERRAIL;
  double ratio_p_to_n = w_pmos / (w_pmos + w_nmos);
  if (ratio_p_to_n >= 1 || ratio_p_to_n <= 0)
  {
    return 0.0;
  }
  w_folded_pmos  = (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS) * ratio_p_to_n;
  w_folded_nmos  = (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS) * (1 - ratio_p_to_n);
  assert(w_folded_pmos > 0);
  num_folded_pmos = (int) (ceil(w_pmos / w_folded_pmos));
  num_folded_nmos = (int) (ceil(w_nmos / w_folded_nmos));
  switch (gate_type)
  {
    case INV:
      total_ndiff_w = compute_diffusion_width(1, num_folded_nmos);
      total_pdiff_w = compute_diffusion_width(1, num_folded_pmos);
      break;
    case NOR:
      total_ndiff_w = compute_diffusion_width(1, num_inputs * num_folded_nmos);
      total_pdiff_w = compute_diffusion_width(num_inputs, num_folded_pmos);
      break;
    case NAND:
      total_ndiff_w = compute_diffusion_width(num_inputs, num_folded_nmos);
      total_pdiff_w = compute_diffusion_width(1, num_inputs * num_folded_pmos);
      break;
    default:
      cout << "Unknown gate type: " << gate_type << endl;
      exit(1);
  }
  gate.w = MAX(total_ndiff_w, total_pdiff_w);
  if (w_folded_nmos > w_nmos)
  {
    //means that the height of the gate can
    //be made smaller than the input height specified, so calculate the height of the gate.
    gate.h = w_nmos + w_pmos + g_tp.MIN_GAP_BET_P_AND_N_DIFFS + 2 * g_tp.HPOWERRAIL;
  }
  else
  {
    gate.h = h_gate;
  }
  return gate.get_area();
 }
 // Returns the width of the cell occupied by a transistor of width
 // input_width once it has been folded into legs no wider than
 // threshold_folding_width. This is the width of the cell, not the width of
 // a device; the two are orthogonal. Non-positive input widths yield 0.
 double Component::compute_tr_width_after_folding(
    double input_width,
    double threshold_folding_width)
 {
  if (input_width <= 0)
  {
    return 0;
  }

  const int    legs          = (int) (ceil(input_width / threshold_folding_width));
  const double contact_pitch = g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact;
  const double poly_w        = g_ip->F_sz_um;

  // One poly stripe per leg, with a contacted diffusion region on each side
  // of every stripe (legs + 1 regions in total).
  return legs * poly_w + (legs + 1) * contact_pitch;
 }
 // Returns the height of a sense amplifier laid out within the given pitch:
 // the folded PMOS devices, the folded NMOS devices and the gap between the
 // P and N diffusion areas.
 double Component::height_sense_amplifier(double pitch_sense_amp)
 {
  // PMOS side: two sense devices plus one isolation device.
  const double sense_p_w = compute_tr_width_after_folding(g_tp.w_sense_p, pitch_sense_amp);
  const double iso_w     = compute_tr_width_after_folding(g_tp.w_iso, pitch_sense_amp);
  const double pmos_h    = sense_p_w * 2 + iso_w + 2 * g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS;

  // NMOS side: two sense devices plus one enable device.
  const double sense_n_w  = compute_tr_width_after_folding(g_tp.w_sense_n, pitch_sense_amp);
  const double sense_en_w = compute_tr_width_after_folding(g_tp.w_sense_en, pitch_sense_amp);
  const double nmos_h     = sense_n_w * 2 + sense_en_w + 2 * g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS;

  return pmos_h + nmos_h + g_tp.MIN_GAP_BET_P_AND_N_DIFFS;
 }
 // Sizes a chain of gates for a total effort F using a target per-stage
 // effort of fopt, and returns the number of stages chosen.
 //
 //   num_gates_min    lower bound on the number of stages
 //   g                multiplier applied when F is recomputed after the last
 //                    stage has been clamped to max_w_nmos
 //   F                total effort the chain has to provide
 //   w_n, w_p         per-stage NMOS/PMOS widths. Entries 1 .. num_gates-1
 //                    are written here. Entry 0 is never written and is READ
 //                    when the last stage is clamped, so the caller must set
 //                    w_n[0] and w_p[0] before calling. Both arrays must hold
 //                    at least MAX_NUMBER_GATES_STAGE entries.
 //   C_load           load capacitance driven by the last stage
 //   p_to_n_sz_ratio  PMOS-to-NMOS width ratio applied to every stage
 //   is_dram_,
 //   is_wl_tr_        forwarded unchanged to gate_C()
 //   max_w_nmos       upper limit on the NMOS width of the last stage
 //
 // The stage count is validated before it is used as an array index, so an
 // out-of-range count is reported by the assertion instead of first writing
 // outside w_n / w_p.
 int Component::logical_effort(
    int num_gates_min,
    double g,
    double F,
    double * w_n,
    double * w_p,
    double C_load,
    double p_to_n_sz_ratio,
    bool   is_dram_,
    bool   is_wl_tr_,
    double max_w_nmos)
 {
  int num_gates = (int) (log(F) / log(fopt));
  // check if num_gates is odd. if so, add 1 to make it even
  num_gates+= (num_gates % 2) ? 1 : 0;
  num_gates = MAX(num_gates, num_gates_min);
  // num_gates - 1 is used as an index right below: check the range first.
  assert(num_gates >= 1 && num_gates <= MAX_NUMBER_GATES_STAGE);
  // recalculate the effective fanout of each stage
  double f = pow(F, 1.0 / num_gates);
  int    i = num_gates - 1;
  double C_in = C_load / f;
  w_n[i]  = (1.0 / (1.0 + p_to_n_sz_ratio)) * C_in / gate_C(1, 0, is_dram_, false, is_wl_tr_);
  w_n[i]  = MAX(w_n[i], g_tp.min_w_nmos_);
  w_p[i]  = p_to_n_sz_ratio * w_n[i];
  if (w_n[i] > max_w_nmos)
  {
    // The last stage would exceed the width limit: clamp it to max_w_nmos and
    // redo the stage count for the effort between the caller-supplied first
    // stage (w_n[0], w_p[0]) and the clamped last stage.
    double C_ld = gate_C((1 + p_to_n_sz_ratio) * max_w_nmos, 0, is_dram_, false, is_wl_tr_);
    F = g * C_ld / gate_C(w_n[0] + w_p[0], 0, is_dram_, false, is_wl_tr_);
    num_gates = (int) (log(F) / log(fopt)) + 1;
    num_gates+= (num_gates % 2) ? 1 : 0;
    num_gates = MAX(num_gates, num_gates_min);
    // The count may have changed: re-check before indexing with it again.
    assert(num_gates >= 1 && num_gates <= MAX_NUMBER_GATES_STAGE);
    f = pow(F, 1.0 / (num_gates - 1));
    i = num_gates - 1;
    w_n[i]  = max_w_nmos;
    w_p[i]  = p_to_n_sz_ratio * w_n[i];
  }
  // Walk back from the last stage towards the first, shrinking each stage by
  // the per-stage effort but never below the minimum NMOS width.
  for (i = num_gates - 2; i >= 1; i--)
  {
    w_n[i] = MAX(w_n[i+1] / f, g_tp.min_w_nmos_);
    w_p[i] = p_to_n_sz_ratio * w_n[i];
  }
  return num_gates;
 }
--- a/ext/mcpat/cacti/component.h
+++ b/ext/mcpat/cacti/component.h
@ -0,0 +1,84 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __COMPONENT_H__
 #define __COMPONENT_H__
 #include "area.h"
 #include "parameter.h"
 using namespace std;
 class Crossbar;
 class Bank;
 // Common record for a modeled circuit block: its area, power, delay and
 // cycle time, together with layout/sizing helpers. logical_effort() is
 // protected so that derived classes can call it.
 // NOTE(review): the destructor is not virtual - confirm that no derived
 // object is ever deleted through a Component pointer.
 class Component
 {
  public:
    Component();
    ~Component();
    Area area;
    powerDef power,rt_power;
    double delay;
    double cycle_time;
    // Area of one INV/NOR/NAND gate with the given device widths, laid out
    // within a height of h_gate.
    double compute_gate_area(
        int gate_type,
        int num_inputs,
        double w_pmos,
        double w_nmos,
        double h_gate);
    // Width of the cell occupied by a transistor of width input_width once it
    // is folded into legs no wider than threshold_folding_width.
    double compute_tr_width_after_folding(double input_width, double threshold_folding_width);
    // Height of a sense amplifier laid out within the given pitch.
    double height_sense_amplifier(double pitch_sense_amp);
  protected:
    // Sizes a chain of gates and returns the number of stages; the per-stage
    // NMOS/PMOS widths are written into w_n and w_p.
    int logical_effort(
        int    num_gates_min,
        double g,
        double F,
        double * w_n,
        double * w_p,
        double C_load,
        double p_to_n_sz_ratio,
        bool   is_dram_,
        bool   is_wl_tr_,
        double max_w_nmos);
  private:
    // Diffusion width of a stack of num_stacked_in devices folded into
    // num_folded_tr legs; helper for compute_gate_area().
    double compute_diffusion_width(int num_stacked_in, int num_folded_tr);
 };
 #endif
--- a/ext/mcpat/cacti/const.h
+++ b/ext/mcpat/cacti/const.h
@ -0,0 +1,270 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __CONST_H__
 #define __CONST_H__
 #include <math.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 /*  The following are things you might want to change
 *  when compiling
 */
 /*
 * Address bits in a word, and number of output bits from the cache
 */
 /*
 was: #define ADDRESS_BITS 32
 now: I'm using 42 bits as in the Power4,
 since that's bigger than the 36 bits on the Pentium 4
 and 40 bits on the Opteron
 */
 const int ADDRESS_BITS = 42;
 /*dt: In addition to the tag bits, the tags also include 1 valid bit, 1 dirty bit, 2 bits for a 4-state
  cache coherency protocol (MESI), 1 bit for MRU (change this to log(ways) for full LRU).
  So in total we have 1 + 1 + 2 + 1 = 5 */
 const int EXTRA_TAG_BITS = 5;
 /* limits on the various N parameters */
 const unsigned int MAXDATAN     = 512;      // maximum for Ndwl and Ndbl
 const unsigned int MAXSUBARRAYS = 1048576;  // maximum subarrays for data and tag arrays
 const unsigned int MAXDATASPD   = 256;      // maximum for Nspd
 const unsigned int MAX_COL_MUX  = 256;
 #define ROUTER_TYPES 3
 // Note: a separate NUMBER_WIRE_TYPES macro with a different value is defined
 // further down in this header; the two are not interchangeable.
 #define WIRE_TYPES 6
 const double Cpolywire = 0;
 /* Threshold voltages (as a proportion of Vdd)
   If you don't know them, set all values to 0.5 */
 #define VTHFA1         0.452
 #define VTHFA2         0.304
 #define VTHFA3         0.420
 #define VTHFA4         0.413
 #define VTHFA5         0.405
 #define VTHFA6         0.452
 #define VSINV          0.452
 #define VTHCOMPINV     0.437
 #define VTHMUXNAND     0.548  // TODO : this constant must be revisited
 #define VTHEVALINV     0.452
 #define VTHSENSEEXTDRV 0.438
 //WmuxdrvNANDn and WmuxdrvNANDp are no longer being used but it's part of the old
 //delay_comparator function which we are using exactly as it used to be, so just setting these to 0
 const double WmuxdrvNANDn = 0;
 const double WmuxdrvNANDp = 0;
 /*===================================================================*/
 /*
 * The following are things you probably wouldn't want to change.
 */
 #define BIGNUM 1e30
 #define INF 9999999
 // MAX and MIN expand each argument more than once, so never pass an
 // expression with side effects (e.g. i++) to them.
 #define MAX(a,b) (((a)>(b))?(a):(b))
 #define MIN(a,b) (((a)<(b))?(a):(b))
 /* Used to communicate with the horowitz model */
 #define RISE 1
 #define FALL 0
 #define NCH  1
 #define PCH  0
 #define EPSILON 0.5 //v4.1: This constant is being used in order to fix floating point -> integer
 //conversion problems that were occurring within CACTI. Typical problem that was occurring was
 //that with different compilers a floating point number like 3.0 would get represented as either
 //2.9999....or 3.00000001 and then the integer part of the floating point number (3.0) would
 //be computed differently depending on the compiler. What we are doing now is to replace
 //int (x) with (int) (x+EPSILON) where EPSILON is 0.5. This would fix such problems. Note that
 //this works only when x is an integer >= 0.
 /*
 * Sheng thinks this is more a solution to solve the simple truncate problem
 * (http://www.cs.tut.fi/~jkorpela/round.html) rather than the problem mentioned above.
 * Unfortunately, this solution causes nasty bugs (different results when using O0 and O3).
 * Moreover, round is not correct in CACTI since when an extra fraction of bit/line is needed,
 * we need to provide a complete bit/line even the fraction is just 0.01.
 * So, in later version than 6.5 we use (int)ceil() to get double to int conversion.
 */
 #define EPSILON2 0.1
 #define EPSILON3 0.6
 #define MINSUBARRAYROWS 16 //For simplicity in modeling, for the row decoding structure, we assume
 //that each row predecode block is composed of at least one 2-4 decoder. When the outputs from the
 //row predecode blocks are combined this means that there are at least 4*4=16 row decode outputs
 #define MAXSUBARRAYROWS 262144 //Each row predecode block produces a max of 2^9 outputs. So
 //the maximum number of row decode outputs will be 2^9*2^9
 #define MINSUBARRAYCOLS 2
 #define MAXSUBARRAYCOLS 262144
 // Gate type codes accepted by Component::compute_gate_area().
 #define INV 0
 #define NOR 1
 #define NAND 2
 // NOTE(review): the ram_cell_tech_type_num enum at the end of this header has
 // five entries (0..4) while this count is 4 - confirm which flavors are
 // actually indexed with it.
 #define NUMBER_TECH_FLAVORS 4
 #define NUMBER_INTERCONNECT_PROJECTION_TYPES 2 //aggressive and conservative
 //0 = Aggressive projections, 1 = Conservative projections
 // NOTE(review): the value is 4 but the comment below names only three wire
 // types - confirm what the fourth one is.
 #define NUMBER_WIRE_TYPES 4 //local, semi-global and global
 //1 = 'Semi-global' wire type, 2 = 'Global' wire type
 const int dram_cell_tech_flavor = 3;
 #define VBITSENSEMIN 0.08 //minimum bitline sense voltage is fixed to be 80 mV.
 // Target per-stage effort used by Component::logical_effort() when it picks
 // the number of stages in a gate chain.
 #define fopt 4.0
 #define INPUT_WIRE_TO_INPUT_GATE_CAP_RATIO 0
 #define BUFFER_SEPARATION_LENGTH_MULTIPLIER 1
 #define NUMBER_MATS_PER_REDUNDANT_MAT 8
 #define NUMBER_STACKED_DIE_LAYERS 1
 // Optimization targets and constraints. Each macro below is described by the
 // comment immediately preceding it.
 // this variable can be set to carry out solution optimization for
 // a maximum area allocation.
 #define STACKED_DIE_LAYER_ALLOTED_AREA_mm2 0 //6.24 //6.21//71.5
 // this variable can also be employed when solution optimization
 // with maximum area allocation is carried out.
 #define MAX_PERCENT_AWAY_FROM_ALLOTED_AREA 50
 // this variable can also be employed when solution optimization
 // with maximum area allocation is carried out.
 #define MIN_AREA_EFFICIENCY 20
 // this variable can be employed when solution with a desired
 // aspect ratio is required.
 #define STACKED_DIE_LAYER_ASPECT_RATIO 1
 // this variable can be employed when solution with a desired
 // aspect ratio is required.
 #define MAX_PERCENT_AWAY_FROM_ASPECT_RATIO 101
 // this variable can be employed to carry out solution optimization
 // for a certain target random cycle time.
 #define TARGET_CYCLE_TIME_ns 1000000000
 #define NUMBER_PIPELINE_STAGES 4
 // this can be used to model the length of interconnect
 // between a bank and a crossbar
 #define LENGTH_INTERCONNECT_FROM_BANK_TO_CROSSBAR 0 //3791 // 2880//micron
 #define IS_CROSSBAR 0
 #define NUMBER_INPUT_PORTS_CROSSBAR 8
 #define NUMBER_OUTPUT_PORTS_CROSSBAR 8
 #define NUMBER_SIGNALS_PER_PORT_CROSSBAR 256
 // Both leakage-reduction factors are currently 1.
 #define MAT_LEAKAGE_REDUCTION_DUE_TO_SLEEP_TRANSISTORS_FACTOR 1
 #define LEAKAGE_REDUCTION_DUE_TO_LONG_CHANNEL_HP_TRANSISTORS_FACTOR 1
 #define PAGE_MODE 0
 #define MAIN_MEM_PER_CHIP_STANDBY_CURRENT_mA 60
 // We are actually not using this variable in the CACTI code. We just want to acknowledge that
 // this current should be multiplied by the DDR(n) system VDD value to compute the standby power
 // consumed during precharge.
 const double VDD_STORAGE_LOSS_FRACTION_WORST = 0.125;
 // Material constants; the units are given in the trailing comments.
 const double CU_RESISTIVITY = 0.022; //ohm-micron
 const double BULK_CU_RESISTIVITY = 0.018; //ohm-micron
 const double PERMITTIVITY_FREE_SPACE = 8.854e-18; //F/micron
 // Wordline-stitching cell counts, one constant per cell type (SRAM, DRAM,
 // commodity DRAM - going by the sram_/dram_/comm_dram_ name prefixes).
 const static uint32_t sram_num_cells_wl_stitching_ = 16;
 const static uint32_t dram_num_cells_wl_stitching_ = 64;
 const static uint32_t comm_dram_num_cells_wl_stitching_ = 256;
 const static double num_bits_per_ecc_b_          = 8.0;
 const double    bit_to_byte  = 8.0;
 // Upper bound asserted by Component::logical_effort() on the number of
 // stages in a gate chain.
 #define MAX_NUMBER_GATES_STAGE 20
 #define MAX_NUMBER_HTREE_NODES 20
 // Leakage stack factors per gate type.
 #define NAND2_LEAK_STACK_FACTOR 0.2
 #define NAND3_LEAK_STACK_FACTOR 0.2
 #define NOR2_LEAK_STACK_FACTOR 0.2
 #define INV_LEAK_STACK_FACTOR  0.5
 #define MAX_NUMBER_ARRAY_PARTITIONS 1000000
 // abbreviations used in this project
 // ----------------------------------
 //
 //  num  : number
 //  rw   : read/write
 //  rd   : read
 //  wr   : write
 //  se   : single-ended
 //  sz   : size
 //  F    : feature
 //  w    : width
 //  h    : height or horizontal
 //  v    : vertical or velocity
 // Numeric codes for the RAM cell technology flavors. lp_dram has the same
 // value (3) as the dram_cell_tech_flavor constant defined earlier in this
 // header.
 enum ram_cell_tech_type_num
 {
  itrs_hp   = 0,
  itrs_lstp = 1,
  itrs_lop  = 2,
  lp_dram   = 3,
  comm_dram = 4
 };
 // Four-element 0/1 weight vectors. pppm selects all four entries; each of
 // the others selects a subset.
 // NOTE(review): going by the suffixes the positions presumably stand for
 // dynamic, subthreshold leakage (Isub), gate leakage (Ig) and short-circuit
 // (sc) power, with pppm_lkg = Isub + Ig - confirm against the code that
 // consumes these arrays.
 const double pppm[4]      = {1,1,1,1};
 const double pppm_lkg[4]  = {0,1,1,0};
 const double pppm_dyn[4]  = {1,0,0,0};
 const double pppm_Isub[4] = {0,1,0,0};
 const double pppm_Ig[4]   = {0,0,1,0};
 const double pppm_sc[4]   = {0,0,0,1};
 #endif
--- a/ext/mcpat/cacti/contention.dat
+++ b/ext/mcpat/cacti/contention.dat
@ -0,0 +1,126 @@
 l34c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l34c64l2b: 9 11 19 29 43 62 81 102
 l34c64l4b: 6 8 12 17 24 29 39 47
 l34c64l8b: 7 8 10 14 18 22 25 30
 l34c64l16b: 7 7 9 12 14 17 20 24
 l34c64l32b: 7 7 9 12 14 17 20 24 -r
 l34c64l64b: 7 7 9 12 14 17 20 24 -r
 l34c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l34c128l2b: 4 10 19 30 44 64 82 103
 l34c128l4b: 3 6 11 17 24 31 38 47
 l34c128l8b: 3 5 9 13 17 21 25 29
 l34c128l16b: 4 5 7 10 13 16 19 22
 l34c128l32b: 4 5 7 10 13 16 19 22 -r
 l34c128l64b: 4 5 7 10 13 16 19 22 -r
 l34c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l34c256l2b: 3 10 19 30 44 63 82 103
 l34c256l4b: 3 6 11 17 24 31 38 47
 l34c256l8b: 2 5 8 12 16 20 24 29
 l34c256l16b: 2 4 7 9 12 15 18 21
 l34c256l32b: 2 4 7 9 12 15 18 21 -r
 l34c256l64b: 2 4 7 9 12 15 18 21 -r
 l38c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l38c64l2b: 57 59 77 90 137 187 219 245
 l38c64l4b: 35 40 48 56 43 61 80 101
 l38c64l8b: 18 27 41 45 52 58 58 58  -r
 l38c64l16b: 16 17 19 35 40 49 53 53 -r
 l38c64l32b: 15 15 17 19 22 25 30 30 -r
 l38c64l64b: 15 15 17 19 22 25 30 30 -r
 l38c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l38c128l2b: 38 50 78 93 139 188 220 245
 l38c128l4b: 29 37 46 56 43 61 81 102
 l38c128l8b: 16 30 39 44 50 57 57 57 -r
 l38c128l16b: 14 16 19 33 40 47 52 52 -r
 l38c128l32b: 14 15 17 20 23 27 31 31 -r
 l38c128l64b: 14 15 17 20 23 27 31 31 -r
 l38c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l38c256l2b: 35 50 78 94 139 188 220 246 
 l38c256l4b: 28 36 45 55 55 61 81 102
 l38c256l8b: 17 30 38 43 50 57 57 57 -r
 l38c256l16b: 15 17 21 32 40 47 51 51
 l38c256l32b: 15 17 19 21 24 29 33 33
 l38c256l64b: 15 17 19 21 24 29 33 33 -r
 l316c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l316c64l2b: 1000 1000 1000 1000 1000 1000 1000 1000
 l316c64l4b: 34 35 78 126 178 220 252 274
 l316c64l8b: 9 11 23 43 62 87 105 130
 l316c64l16b: 7 9 13 23 33 45 56 67
 l316c64l32b: 5 6 7 10 13 19 25 30
 l316c64l64b: 4 5 6 8 10 14 18 21
 l316c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l316c128l2b: 25 131 243 1000 1000 1000 1000 1000
 l316c128l4b: 8 28 79 127 179 221 253 274
 l316c128l8b: 4 9 22 43 62 88 106 131
 l316c128l16b: 4 6 11 21 32 44 55 67
 l316c128l32b: 4 6 11 12 12 18 24 29
 l316c128l64b: 2 3 5 7 9 13 17 21
 l316c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l316c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
 l316c256l4b: 5 28 80 128 180 221 253 274
 l316c256l8b: 3 8 22 43 63 88 107 131
 l316c256l16b: 2 5 11 21 32 44 55 67
 l316c256l32b: 2 3 5 8 12 18 24 29
 l316c256l64b: 2 3 4 6 9 13 17 21
 l24c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l24c64l2b: 10 12 24 41 60 86 105 122
 l24c64l4b: 5 7 13 20 29 38 47 56
 l24c64l8b: 5 6 9 14 18 24 29 35
 l24c64l16b: 4 5 7 10 12 16 19 22
 l24c64l32b: 5 5 6 8 10 12 14 17
 l24c64l64b: 5 5 6 8 10 12 14 16
 l24c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l24c128l2b: 1000 1000 1000 1000 1000 1000 1000 1000
 l24c128l4b: 3 7 13 20 29 38 47 57
 l24c128l8b: 3 5 9 13 18 23 29 35
 l24c128l16b: 3 4 6 9 12 15 19 22
 l24c128l32b: 3 4 5 7 9 11 14 16
 l24c128l64b: 1000 1000 1000 1000 1000 1000 1000 1000
 l24c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l24c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
 l24c256l4b: 2 6 13 20 29 38 47 57
 l24c256l8b: 2 4 8 13 18 23 28 35
 l24c256l16b: 2 3 6 8 11 15 18 22
 l24c256l32b: 2 3 5 6 8 11 14 16
 l24c256l64b: 1000 1000 1000 1000 1000 1000 1000 1000
 l28c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l28c64l2b: 46 52 117 157 188 225 246 261
 l28c64l4b: 19 25 39 54 96 107 120 150
 l28c64l8b: 9 12 21 30 39 47 58 79
 l28c64l16b: 8 9 11 16 25 32 37 42
 l28c64l32b: 7 8 9 11 14 19 23 28
 l28c64l64b: 7 7 8 10 12 14 18 22 
 l28c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l28c128l2b: 1000 1000 1000 1000 1000 1000 1000 1000
 l28c128l4b: 12 22 39 54 98 108 130 151
 l28c128l8b: 7 12 21 30 39 48 59 80
 l28c128l16b: 6 8 11 16 24 31 37 42
 l28c128l32b: 6 7 9 11 14 19 24 28
 l28c128l64b: 6 7 9 11 14 19 24 28
 l28c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l28c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
 l28c256l4b: 12 22 39 54 100 108 130 152
 l28c256l8b: 7 12 21 30 39 48 59 81
 l28c256l16b: 6 8 11 16 24 31 37 42
 l28c256l32b: 6 7 9 11 14 19 24 28
 l28c256l64b: 6 7 9 11 14 19 24 28
 l216c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l216c64l2b: 1000 1000 1000 1000 1000 1000 1000 1000
 l216c64l4b: 34 35 78 126 178 220 252 274
 l216c64l8b: 9 11 23 43 62 87 105 130
 l216c64l16b: 7 9 13 23 33 45 56 67
 l216c64l32b: 5 6 7 10 13 19 25 30
 l216c64l64b: 4 5 6 8 10 14 18 21
 l216c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l216c128l2b: 25 131 243 1000 1000 1000 1000 1000
 l216c128l4b: 8 28 79 127 179 221 253 274
 l216c128l8b: 4 9 22 43 62 88 106 131
 l216c128l16b: 4 6 11 21 32 44 55 67
 l216c128l32b: 4 6 11 12 12 18 24 29
 l216c128l64b: 2 3 5 7 9 13 17 21
 l216c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
 l216c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
 l216c256l4b: 5 28 80 128 180 221 253 274
 l216c256l8b: 3 8 22 43 63 88 107 131
 l216c256l16b: 2 5 11 21 32 44 55 67
 l216c256l32b: 2 3 5 8 12 18 24 29
 l216c256l64b: 2 3 4 6 9 13 17 21
--- a/ext/mcpat/cacti/crossbar.cc
+++ b/ext/mcpat/cacti/crossbar.cc
@ -0,0 +1,161 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include "crossbar.h"
 // Lower bound on the crossbar aspect ratio accepted by compute_power();
 // below it, compute_power() may raise CB_ADJ and recompute.
 #define ASPECT_THRESHOLD .8
 // Scale factor applied to the wire repeater size when output_buffer()
 // derives the tri-state driver size.
 #define ADJ 1
 /*
  * Create a crossbar model with n_inp_ inputs, n_out_ outputs and a flit
  * size of flit_size_, using the device parameters pointed to by dt.
  * Only derived constants are cached here; no power/area is computed until
  * compute_power() is called.
  */
 Crossbar::Crossbar(double n_inp_, double n_out_, double flit_size_,
                    TechnologyParameter::DeviceType *dt)
     : n_inp(n_inp_),
       n_out(n_out_),
       flit_size(flit_size_),
       deviceType(dt)
 {
   // Initial cell-height adjustment factor; compute_power() may increase it.
   CB_ADJ = 1;
   // Supply voltage of the selected device type.
   Vdd = deviceType->Vdd;
   // Minimum PMOS width: minimum NMOS width scaled by the N-to-P drive ratio.
   min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * g_tp.min_w_nmos_;
 }
 // Empty destructor; the deviceType pointer is not released here.
 Crossbar::~Crossbar()
 {
 }
 double Crossbar::output_buffer()
 {
  //Wire winit(4, 4);
  double l_eff = n_inp*flit_size*g_tp.wire_outside_mat.pitch;
  Wire w1(g_ip->wt, l_eff);
  //double s1 = w1.repeater_size *l_eff*ADJ/w1.repeater_spacing;
  double s1 = w1.repeater_size * (l_eff <w1.repeater_spacing?  l_eff *ADJ/w1.repeater_spacing : ADJ);
  double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
  // the model assumes input capacitance of the wire driver = input capacitance of nand + nor = input cap of the driver transistor
  TriS1 = s1*(1 + pton_size)/(2 + pton_size + 1 + 2*pton_size);
  TriS2 = s1; //driver transistor
  if (TriS1 < 1)
    TriS1 = 1;
  double input_cap = gate_C(TriS1*(2*min_w_pmos + g_tp.min_w_nmos_), 0) +
    gate_C(TriS1*(min_w_pmos + 2*g_tp.min_w_nmos_), 0);
 //  input_cap += drain_C_(TriS1*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
 //    drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 +
 //    gate_C(TriS2*g_tp.min_w_nmos_, 0)+
 //    drain_C_(TriS1*min_w_pmos, NCH, 1, 1, g_tp.cell_h_def)*2 +
 //    drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
 //    gate_C(TriS2*min_w_pmos, 0);
  tri_int_cap = drain_C_(TriS1*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
    drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 +
    gate_C(TriS2*g_tp.min_w_nmos_, 0)+
    drain_C_(TriS1*min_w_pmos, NCH, 1, 1, g_tp.cell_h_def)*2 +
    drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
    gate_C(TriS2*min_w_pmos, 0);
  double output_cap = drain_C_(TriS2*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
    drain_C_(TriS2*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def);
  double ctr_cap = gate_C(TriS2 *(min_w_pmos + g_tp.min_w_nmos_), 0);
  tri_inp_cap = input_cap;
  tri_out_cap = output_cap;
  tri_ctr_cap = ctr_cap;
  return input_cap + output_cap + ctr_cap;
 }
 /*
  * Compute the crossbar's area (area.w, area.h), dynamic energy, leakage,
  * gate leakage and delay, storing them in the inherited area/power/delay
  * members.
  *
  * The tri-state buffer is sized first (output_buffer()), the crossbar
  * dimensions are derived from it, and if the resulting aspect ratio is
  * below ASPECT_THRESHOLD (and both port counts exceed 2) CB_ADJ is raised
  * by 0.2 and the function calls itself again while CB_ADJ < 4.
  */
 void Crossbar::compute_power()
 {
  // NOTE(review): winit is never referenced again; presumably constructed
  // for a side effect of the Wire constructor -- confirm against wire.cc.
  Wire winit(4, 4);
  double tri_cap = output_buffer();
  assert(tri_cap > 0);
  //area of a tristate logic
  double g_area = compute_gate_area(INV, 1, TriS2*g_tp.min_w_nmos_, TriS2*min_w_pmos, g_tp.cell_h_def);
  g_area *= 2; // to model area of output transistors
  g_area += compute_gate_area (NAND, 2, TriS1*2*g_tp.min_w_nmos_, TriS1*min_w_pmos, g_tp.cell_h_def);
  g_area += compute_gate_area (NOR, 2, TriS1*g_tp.min_w_nmos_, TriS1*2*min_w_pmos, g_tp.cell_h_def);
  // A larger CB_ADJ makes each cell narrower here and the crossbar taller
  // below (area.h is multiplied by CB_ADJ).
  double width /*per tristate*/ = g_area/(CB_ADJ * g_tp.cell_h_def);
  // effective no. of tristate buffers that need to be laid side by side
  int ntri = (int)ceil(g_tp.cell_h_def/(g_tp.wire_outside_mat.pitch));
  // Crossbar width: the larger of the buffer-limited and wire-limited widths.
  double wire_len = MAX(width*ntri*n_out, flit_size*g_tp.wire_outside_mat.pitch*n_out);
  Wire w1(g_ip->wt, wire_len);
  area.w = wire_len;
  area.h = g_tp.wire_outside_mat.pitch*n_inp*flit_size * CB_ADJ;
  Wire w2(g_ip->wt, area.h);
  // Aspect ratio normalised by port counts and folded into (0, 1].
  double aspect_ratio_cb = (area.h/area.w)*(n_out/n_inp);
  if (aspect_ratio_cb > 1) aspect_ratio_cb = 1/aspect_ratio_cb;
  if (aspect_ratio_cb < ASPECT_THRESHOLD) {
    if (n_out > 2 && n_inp > 2) {
      CB_ADJ+=0.2;
      //cout << "CB ADJ " << CB_ADJ << endl;
      if (CB_ADJ < 4) {
        // NOTE(review): when this call returns, execution falls through and
        // the statements below overwrite power and delay using this frame's
        // w1/w2 (built before CB_ADJ was raised) together with area and
        // tri_*_cap members left by the nested call -- confirm whether an
        // early return was intended here.
        this->compute_power();
      }
    }
  }
  // Dynamic energy: both wires plus the switched tri-state capacitances,
  // multiplied by flit_size.
  power.readOp.dynamic = (w1.power.readOp.dynamic + w2.power.readOp.dynamic + (tri_inp_cap * n_out + tri_out_cap * n_inp + tri_ctr_cap + tri_int_cap) * Vdd*Vdd)*flit_size;
  // Subthreshold leakage summed over n_inp * n_out * flit_size cross-points.
  power.readOp.leakage      =  n_inp * n_out * flit_size * (
    cmos_Isub_leakage(g_tp.min_w_nmos_*TriS2*2, min_w_pmos*TriS2*2, 1, inv) *Vdd+
        cmos_Isub_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nand)*Vdd+
        cmos_Isub_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nor) *Vdd+
    w1.power.readOp.leakage + w2.power.readOp.leakage);
  // Gate leakage, same structure as the subthreshold term above.
  power.readOp.gate_leakage = n_inp * n_out * flit_size * (
          cmos_Ig_leakage(g_tp.min_w_nmos_*TriS2*2, min_w_pmos*TriS2*2, 1, inv) *Vdd+
          cmos_Ig_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nand)*Vdd+
          cmos_Ig_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nor) *Vdd+
          w1.power.readOp.gate_leakage + w2.power.readOp.gate_leakage);
  // delay calculation
  double l_eff = n_inp*flit_size*g_tp.wire_outside_mat.pitch;
  Wire wdriver(g_ip->wt, l_eff);
  // Lumped RC: wire resistance/capacitance over (width + height) plus the
  // driver on-resistance and the tri-state loads.
  double res = g_tp.wire_outside_mat.R_per_um * (area.w+area.h) + tr_R_on(g_tp.min_w_nmos_*wdriver.repeater_size, NCH, 1);
  double cap = g_tp.wire_outside_mat.C_per_um * (area.w + area.h) + n_out*tri_inp_cap + n_inp*tri_out_cap;
  delay = horowitz(w1.signal_rise_time(), res*cap, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE);
  // NOTE(review): this line declares a function named wreset that returns
  // Wire; it does not construct an object. If a default-constructed Wire was
  // intended, the parentheses must be dropped -- confirm intent.
  Wire wreset();
 }
 /*
  * Write a human-readable summary of the crossbar (dimensions, dynamic
  * figure, leakage, gate leakage and delay) to standard output.
  * The values printed are whatever compute_power() last stored.
  */
 void Crossbar::print_crossbar()
 {
   // Scale the stored quantities to the units named in the labels.
   const double dyn_scaled      = power.readOp.dynamic*1e9 * MIN(n_inp, n_out);
   const double leak_scaled     = power.readOp.leakage*1e3;
   const double gate_leak_scaled = power.readOp.gate_leakage*1e3;
   const double delay_scaled    = delay*1e12;

   cout << "\nCrossbar Stats (" << n_inp << "x" << n_out << ")\n\n";
   cout << "Flit size        : " << flit_size << " bits" << endl
        << "Width            : " << area.w << " u" << endl
        << "Height           : " << area.h << " u" << endl
        << "Dynamic Power    : " << dyn_scaled << " (nJ)" << endl
        << "Leakage Power    : " << leak_scaled << " (mW)" << endl
        << "Gate Leakage Power    : " << gate_leak_scaled << " (mW)" << endl;
   cout << "Crossbar Delay   : " << delay_scaled << " ps\n";
 }
--- a/ext/mcpat/cacti/crossbar.h
+++ b/ext/mcpat/cacti/crossbar.h
@ -0,0 +1,85 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __CROSSBAR__
 #define __CROSSBAR__
 #include <assert.h>
 #include <iostream>
 #include "basic_circuit.h"
 #include "cacti_interface.h"
 #include "component.h"
 #include "mat.h"
 #include "parameter.h"
 #include "wire.h"
 /*
  * Crossbar model. compute_power() fills the area, power and delay members
  * used by crossbar.cc; print_crossbar() prints them.
  */
 class Crossbar : public Component
 {
  public:
    /*
     * in / out  - number of input and output ports
     * flit_sz   - flit size (printed as bits by print_crossbar())
     * dt        - device parameters; defaults to g_tp.peri_global
     */
    Crossbar(
      double in,
      double out,
      double flit_sz,
      TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
    ~Crossbar();
    // Print a summary of the computed results to standard output.
    void print_crossbar();
    // Size the cross-point tri-state buffer, set the tri_*_cap members and
    // return the sum of input, output and control capacitance.
    double output_buffer();
    // Compute area, power and delay of the crossbar.
    void compute_power();
    double n_inp, n_out;
    double flit_size;
    // Tri-state buffer capacitances: input, output, control and internal.
    double tri_inp_cap, tri_out_cap, tri_ctr_cap, tri_int_cap;
  private:
          double CB_ADJ;
          /*
           * Adjustment factor for the height of the cross-point (tri-state
           * buffer) cell layout. It is increased to bring the aspect ratio of
           * the whole crossbar closer to one. While it is adjusted, the
           * number of wires routed over the tri-state buffers does not
           * change, but the effective wiring pitch does: as CB_ADJ grows the
           * tri-state buffer becomes taller and thinner and the effective
           * wiring pitch increases, so the crossbar height (area.h) grows.
           */
        TechnologyParameter::DeviceType *deviceType;
    // Sizes of the logic stage (TriS1) and driver transistors (TriS2) of the
    // tri-state buffer; set by output_buffer().
    double TriS1, TriS2;
    // Cached minimum PMOS width and supply voltage; set in the constructor.
    double min_w_pmos, Vdd;
 };
 #endif
--- a/ext/mcpat/cacti/decoder.cc
+++ b/ext/mcpat/cacti/decoder.cc
--- a/ext/mcpat/cacti/decoder.h
+++ b/ext/mcpat/cacti/decoder.h
@ -0,0 +1,247 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __DECODER_H__
 #define __DECODER_H__
 #include <vector>
 #include "area.h"
 #include "component.h"
 #include "parameter.h"
 using namespace std;
 /*
  * Decoder component.
  * NOTE(review): only the declaration is visible here; the member notes
  * below are inferred from names and should be confirmed against decoder.cc.
  */
 class Decoder : public Component
 {
  public:
    Decoder(
        int _num_dec_signals,
        bool flag_way_select,
        double _C_ld_dec_out,
        double _R_wire_dec_out,
        bool fully_assoc_,
        bool is_dram_,
        bool is_wl_tr_,
        const Area & cell_);
    bool   exist;            // presumably false when no decoder is needed -- confirm
    int    num_in_signals;
    double C_ld_dec_out;     // presumably load capacitance at the decoder output -- confirm
    double R_wire_dec_out;   // presumably wire resistance at the decoder output -- confirm
    int    num_gates;
    int    num_gates_min;
    // Per-stage transistor widths; at most MAX_NUMBER_GATES_STAGE entries.
    double w_dec_n[MAX_NUMBER_GATES_STAGE];
    double w_dec_p[MAX_NUMBER_GATES_STAGE];
    // NOTE(review): if Component also declares `delay`, this member hides it
    // (a `power` member was commented out just below) -- confirm.
    double delay;
    //powerDef power;
    bool   fully_assoc;
    bool   is_dram;
    bool   is_wl_tr;
    // Reference member: it must be bound in the constructor and prevents the
    // compiler from generating a copy-assignment operator for this class.
    const  Area & cell;
    void   compute_widths();
    void   compute_area();
    double compute_delays(double inrisetime);  // return outrisetime
    void leakage_feedback(double temperature);
 };
 /*
  * Predecoder block component; holds a pointer to the Decoder it is
  * associated with.
  * NOTE(review): only the declaration is visible here; the grouping notes
  * below are inferred from member names -- confirm against decoder.cc.
  */
 class PredecBlk : public Component
 {
 public:
  PredecBlk(
      int num_dec_signals,
      Decoder * dec,
      double C_wire_predec_blk_out,
      double R_wire_predec_blk_out,
      int    num_dec_per_predec,
      bool   is_dram_,
      bool   is_blk1);
  Decoder * dec;
  bool exist;
  int number_input_addr_bits;
  double C_ld_predec_blk_out;
  double R_wire_predec_blk_out;
  int branch_effort_nand2_gate_output;
  int branch_effort_nand3_gate_output;
  bool   flag_two_unique_paths;
  int flag_L2_gate;
  int number_inputs_L1_gate;
  // Gate counts for the two first-level paths (NAND2 / NAND3) and the
  // second level.
  int number_gates_L1_nand2_path;
  int number_gates_L1_nand3_path;
  int number_gates_L2;
  int min_number_gates_L1;
  int min_number_gates_L2;
  int num_L1_active_nand2_path;
  int num_L1_active_nand3_path;
  // Per-stage transistor widths; at most MAX_NUMBER_GATES_STAGE entries each.
  double w_L1_nand2_n[MAX_NUMBER_GATES_STAGE];
  double w_L1_nand2_p[MAX_NUMBER_GATES_STAGE];
  double w_L1_nand3_n[MAX_NUMBER_GATES_STAGE];
  double w_L1_nand3_p[MAX_NUMBER_GATES_STAGE];
  double w_L2_n[MAX_NUMBER_GATES_STAGE];
  double w_L2_p[MAX_NUMBER_GATES_STAGE];
  // Delay and power kept separately per path.
  double delay_nand2_path;
  double delay_nand3_path;
  powerDef power_nand2_path;
  powerDef power_nand3_path;
  powerDef power_L2;
  bool is_dram_;
  void compute_widths();
  void compute_area();
  void leakage_feedback(double temperature);
  // Takes the input rise times as <nand2, nand3> and returns the output rise
  // times in the same order.
  pair<double, double> compute_delays(pair<double, double> inrisetime); // <nand2, nand3>
  // return <outrise_nand2, outrise_nand3>
 };
 /*
  * Driver component for a predecoder block; holds pointers to the PredecBlk
  * and Decoder it is associated with.
  * NOTE(review): only the declaration is visible here; notes on member
  * meaning are inferred from names -- confirm against decoder.cc.
  */
 class PredecBlkDrv : public Component
 {
 public:
  PredecBlkDrv(
      int   way_select,
      PredecBlk * blk_,
      bool  is_dram);
  int flag_driver_exists;
  int number_input_addr_bits;
  int number_gates_nand2_path;
  int number_gates_nand3_path;
  int min_number_gates;
  // Buffer counts, grouped by the load each buffer drives; summed by the
  // num_addr_bits_*_path() helpers below.
  int num_buffers_driving_1_nand2_load;
  int num_buffers_driving_2_nand2_load;
  int num_buffers_driving_4_nand2_load;
  int num_buffers_driving_2_nand3_load;
  int num_buffers_driving_8_nand3_load;
  int num_buffers_nand3_path;
  double c_load_nand2_path_out;
  double c_load_nand3_path_out;
  double r_load_nand2_path_out;
  double r_load_nand3_path_out;
  // Per-stage transistor widths; at most MAX_NUMBER_GATES_STAGE entries each.
  double width_nand2_path_n[MAX_NUMBER_GATES_STAGE];
  double width_nand2_path_p[MAX_NUMBER_GATES_STAGE];
  double width_nand3_path_n[MAX_NUMBER_GATES_STAGE];
  double width_nand3_path_p[MAX_NUMBER_GATES_STAGE];
  // Delay and power kept separately per path.
  double delay_nand2_path;
  double delay_nand3_path;
  powerDef power_nand2_path;
  powerDef power_nand3_path;
  PredecBlk * blk;
  Decoder   * dec;
  bool  is_dram_;
  int   way_select;
  void compute_widths();
  void compute_area();
  void leakage_feedback(double temperature);
  pair<double, double> compute_delays(
      double inrisetime_nand2_path,
      double inrisetime_nand3_path);  // return <outrise_nand2, outrise_nand3>
  // Sum of the three NAND2-path buffer counts.
  inline int num_addr_bits_nand2_path()
  {
    return num_buffers_driving_1_nand2_load +
           num_buffers_driving_2_nand2_load +
           num_buffers_driving_4_nand2_load;
  }
  // Sum of the two NAND3-path buffer counts
  // (num_buffers_nand3_path is not included).
  inline int num_addr_bits_nand3_path()
  {
    return num_buffers_driving_2_nand3_load +
           num_buffers_driving_8_nand3_load;
  }
  double get_rdOp_dynamic_E(int num_act_mats_hor_dir);
 };
 /*
  * Predecoder component built from two predecoder-block drivers; it also
  * holds pointers to two predecoder blocks.
  * NOTE(review): how blk1/blk2 are obtained is not visible in this header
  * -- presumably from the drivers; confirm against decoder.cc.
  */
 class Predec : public Component
 {
  public:
    Predec(
        PredecBlkDrv * drv1,
        PredecBlkDrv * drv2);
    double compute_delays(double inrisetime);  // return outrisetime
    void leakage_feedback(double temperature);
    PredecBlk    * blk1;
    PredecBlk    * blk2;
    PredecBlkDrv * drv1;
    PredecBlkDrv * drv2;
    // Power is kept separately for the blocks and for their drivers.
    powerDef block_power;
    powerDef driver_power;
  private:
    // returns <delay, risetime>
    pair<double, double> get_max_delay_before_decoder(
        pair<double, double> input_pair1,
        pair<double, double> input_pair2);
 };
 /*
  * Driver component parameterised by a gate load, a wire capacitance and a
  * wire resistance.
  * NOTE(review): only the declaration is visible here; confirm member
  * semantics against decoder.cc.
  */
 class Driver : public Component
 {
 public:
  Driver(double c_gate_load_, double c_wire_load_, double r_wire_load_, bool is_dram);
  int    number_gates;
  int    min_number_gates;
  // Per-stage transistor widths; at most MAX_NUMBER_GATES_STAGE entries each.
  double width_n[MAX_NUMBER_GATES_STAGE];
  double width_p[MAX_NUMBER_GATES_STAGE];
  double c_gate_load;
  double c_wire_load;
  double r_wire_load;
  // NOTE(review): if Component already declares `delay` and `power`, the two
  // members below hide the inherited ones -- confirm this is intended.
  double delay;
  powerDef power;
  bool   is_dram_;
  void   compute_widths();
  double compute_delay(double inrisetime);
 };
 #endif
--- a/ext/mcpat/cacti/htree2.cc
+++ b/ext/mcpat/cacti/htree2.cc
@ -0,0 +1,641 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include <cassert>
 #include <iostream>
 #include "htree2.h"
 #include "wire.h"
 /*
  * Build an H-tree model and immediately evaluate it.
  *
  * The tree kind (htree_type) selects which bit count becomes the initial
  * wire bandwidth and whether in_htree() or out_htree() does the modelling.
  * After modelling, power_bit keeps a copy of the power figures as computed,
  * and power.readOp.dynamic is then multiplied by the initial bandwidth.
  *
  * Both bl (ndbl) and wl (ndwl) must be at least 2; this is asserted.
  */
 Htree2::Htree2(
    enum Wire_type wire_model, double mat_w, double mat_h,
    int a_bits, int d_inbits, int search_data_in, int d_outbits,
    int search_data_out, int bl, int wl, enum Htree_type htree_type,
    bool uca_tree_, bool search_tree_, TechnologyParameter::DeviceType *dt)
 : in_rise_time(0),
   out_rise_time(0),
   tree_type(htree_type),
   mat_width(mat_w),
   mat_height(mat_h),
   add_bits(a_bits),
   data_in_bits(d_inbits),
   search_data_in_bits(search_data_in),
   data_out_bits(d_outbits),
   search_data_out_bits(search_data_out),
   ndbl(bl),
   ndwl(wl),
   uca_tree(uca_tree_),
   search_tree(search_tree_),
   wt(wire_model),
   deviceType(dt)
 {
   assert(ndbl >= 2 && ndwl >= 2);

   max_unpipelined_link_delay = 0; //TODO
   min_w_nmos = g_tp.min_w_nmos_;
   min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * min_w_nmos;

   // Pick the bus width for this tree kind and run the matching model.
   if (htree_type == Add_htree) {
     init_wire_bw = add_bits;
     wire_bw = init_wire_bw;
     in_htree();
   } else if (htree_type == Data_in_htree) {
     init_wire_bw = data_in_bits;
     wire_bw = init_wire_bw;
     in_htree();
   } else if (htree_type == Data_out_htree) {
     init_wire_bw = data_out_bits;
     wire_bw = init_wire_bw;
     out_htree();
   } else if (htree_type == Search_in_htree) {
     // in_search_tree is broadcast, out_htree is not.
     init_wire_bw = search_data_in_bits;
     wire_bw = init_wire_bw;
     in_htree();
   } else if (htree_type == Search_out_htree) {
     init_wire_bw = search_data_out_bits;
     wire_bw = init_wire_bw;
     out_htree();
   } else {
     assert(0);
   }

   // Keep the figures as modelled, then scale dynamic energy by the bus width.
   power_bit = power;
   power.readOp.dynamic *= init_wire_bw;

   assert(power.readOp.dynamic >= 0);
   assert(power.readOp.leakage >= 0);
 }
 // nand gate sizing calculation
 void Htree2::input_nand(double s1, double s2, double l_eff)
 {
  Wire w1(wt, l_eff);
  double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
  // input capacitance of a repeater  = input capacitance of nand.
  double nsize = s1*(1 + pton_size)/(2 + pton_size);
  nsize = (nsize < 1) ? 1 : nsize;
  double tc = 2*tr_R_on(nsize*min_w_nmos, NCH, 1) *
    (drain_C_(nsize*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)*2 +
     2 * gate_C(s2*(min_w_nmos + min_w_pmos), 0));
  delay+= horowitz (w1.out_rise_time, tc,
      deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE);
  power.readOp.dynamic += 0.5 *
    (2*drain_C_(pton_size * nsize*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
     + drain_C_(nsize*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
     + 2*gate_C(s2*(min_w_nmos + min_w_pmos), 0)) *
    deviceType->Vdd * deviceType->Vdd;
    power.searchOp.dynamic += 0.5 *
    (2*drain_C_(pton_size * nsize*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
     + drain_C_(nsize*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
     + 2*gate_C(s2*(min_w_nmos + min_w_pmos), 0)) *
    deviceType->Vdd * deviceType->Vdd * wire_bw ;
  power.readOp.leakage += (wire_bw*cmos_Isub_leakage(min_w_nmos*(nsize*2), min_w_pmos * nsize * 2, 2, nand))*deviceType->Vdd;
  power.readOp.gate_leakage += (wire_bw*cmos_Ig_leakage(min_w_nmos*(nsize*2), min_w_pmos * nsize * 2, 2, nand))*deviceType->Vdd;
 }
 // tristate buffer model consisting of not, nand, nor, and driver transistors
 void Htree2::output_buffer(double s1, double s2, double l_eff)
 {
  Wire w1(wt, l_eff);
  double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
  // input capacitance of repeater = input capacitance of nand + nor.
  double size = s1*(1 + pton_size)/(2 + pton_size + 1 + 2*pton_size);
  double s_eff =  //stage eff of a repeater in a wire
    (gate_C(s2*(min_w_nmos + min_w_pmos), 0) + w1.wire_cap(l_eff*1e-6,true))/
    gate_C(s2*(min_w_nmos + min_w_pmos), 0);
  double tr_size = gate_C(s1*(min_w_nmos + min_w_pmos), 0) * 1/2/(s_eff*gate_C(min_w_pmos, 0));
  size = (size < 1) ? 1 : size;
  double res_nor = 2*tr_R_on(size*min_w_pmos, PCH, 1);
  double res_ptrans = tr_R_on(tr_size*min_w_nmos, NCH, 1);
  double cap_nand_out = drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def) +
                        drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 +
                        gate_C(tr_size*min_w_pmos, 0);
  double cap_ptrans_out = 2 *(drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
                              drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)) +
                          gate_C(s1*(min_w_nmos + min_w_pmos), 0);
  double tc = res_nor * cap_nand_out + (res_nor + res_ptrans) * cap_ptrans_out;
  delay += horowitz (w1.out_rise_time, tc,
      deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE);
  //nand
  power.readOp.dynamic += 0.5 *
    (2*drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
       drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def) +
     gate_C(tr_size*(min_w_pmos), 0)) *
    deviceType->Vdd * deviceType->Vdd;
    power.searchOp.dynamic += 0.5 *
    (2*drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
       drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def) +
     gate_C(tr_size*(min_w_pmos), 0)) *
    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
  //not
  power.readOp.dynamic += 0.5 *
    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
     +drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
     +gate_C(size*(min_w_nmos + min_w_pmos), 0)) *
    deviceType->Vdd * deviceType->Vdd;
    power.searchOp.dynamic += 0.5 *
    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
     +drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
     +gate_C(size*(min_w_nmos + min_w_pmos), 0)) *
    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
  //nor
  power.readOp.dynamic += 0.5 *
    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
     + 2*drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
     +gate_C(tr_size*(min_w_nmos + min_w_pmos), 0)) *
    deviceType->Vdd * deviceType->Vdd;
    power.searchOp.dynamic += 0.5 *
    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
     + 2*drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
     +gate_C(tr_size*(min_w_nmos + min_w_pmos), 0)) *
    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
  //output transistor
  power.readOp.dynamic += 0.5 *
    ((drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
      +drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def))*2
     + gate_C(s1*(min_w_nmos + min_w_pmos), 0)) *
    deviceType->Vdd * deviceType->Vdd;
    power.searchOp.dynamic += 0.5 *
    ((drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
      +drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def))*2
     + gate_C(s1*(min_w_nmos + min_w_pmos), 0)) *
    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
  if(uca_tree) {
        power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
        power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
        power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
        power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
    //power.readOp.gate_leakage *=;
  }
  else {
        power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
        power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
        power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
        power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
    //power.readOp.gate_leakage *=deviceType->Vdd*wire_bw;
  }
 }
 /* Calculates the delay and power of the input H-tree.
 * A NAND gate is used at each node to
 * limit the signal.
 * The area of an unbalanced H-tree (rows != columns)
 * depends on how the data is routed.
 * In the following function, if (no. of rows < no. of columns),
 * the data first traverses the excess horizontal links until the
 * numbers of vertical and horizontal nodes are the same.
 * If the no. of rows is bigger, the data traverses
 * a horizontal link followed by a vertical link, repeatedly
 * (similar to a balanced tree), until there are no
 * horizontal links left. After this it goes through the remaining
 * vertical links.
 */
  void
 Htree2::in_htree()
 {
  //temp var
  double s1 = 0, s2 = 0, s3 = 0;
  double l_eff = 0;
  Wire *wtemp1 = 0, *wtemp2 = 0, *wtemp3 = 0;
  double len = 0, ht = 0;
  int option = 0;
  int h = (int) _log2(ndwl/2); // horizontal nodes
  int v = (int) _log2(ndbl/2); // vertical nodes
  double len_temp;
  double ht_temp;
  if (uca_tree)
  {//Sheng: this computation do not consider the wires that route from edge to middle.
    ht_temp = (mat_height*ndbl/2 +/* since uca_tree models interbank tree, mat_height => bank height */
        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
         2 * (1-pow(0.5,h))))/2;
    len_temp = (mat_width*ndwl/2 +
        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
         2 * (1-pow(0.5,v))))/2;
  }
  else
  {
    if (ndwl == ndbl) {
      ht_temp = ((mat_height*ndbl/2) +
          ((add_bits + (search_data_in_bits + search_data_out_bits))* (ndbl/2-1) * g_tp.wire_outside_mat.pitch) +
          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
          )/2;
      len_temp = (mat_width*ndwl/2 +
        ((add_bits + (search_data_in_bits + search_data_out_bits)) * (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
        ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
    }
    else if (ndwl > ndbl) {
      double excess_part = (_log2(ndwl/2) - _log2(ndbl/2));
      ht_temp = ((mat_height*ndbl/2) +
          ((add_bits + + (search_data_in_bits + search_data_out_bits)) * ((ndbl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch *
          (2*(1 - pow(0.5, h-v)) + pow(0.5, v-h) * v))/2;
      len_temp = (mat_width*ndwl/2 +
        ((add_bits + (search_data_in_bits + search_data_out_bits))* (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
        ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
    }
    else {
       double excess_part = (_log2(ndbl/2) - _log2(ndwl/2));
      ht_temp = ((mat_height*ndbl/2) +
          ((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
          )/2;
      len_temp = (mat_width*ndwl/2 +
          ((add_bits + (search_data_in_bits + search_data_out_bits)) * ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * (h + 2*(1-pow(0.5, v-h))))/2;
    }
  }
  area.h   = ht_temp * 2;
  area.w   = len_temp * 2;
  delay = 0;
  power.readOp.dynamic = 0;
  power.readOp.leakage = 0;
  power.searchOp.dynamic =0;
  len = len_temp;
  ht  = ht_temp/2;
  while (v > 0 || h > 0)
  {
    if (wtemp1) delete wtemp1;
    if (wtemp2) delete wtemp2;
    if (wtemp3) delete wtemp3;
    if (h > v)
    {
      //the iteration considers only one horizontal link
      wtemp1 = new Wire(wt, len); // hor
      wtemp2 = new Wire(wt, len/2);  // ver
      len_temp = len;
      len /= 2;
      wtemp3 = 0;
      h--;
      option = 0;
    }
    else if (v>0 && h>0)
    {
      //considers one horizontal link and one vertical link
      wtemp1 = new Wire(wt, len); // hor
      wtemp2 = new Wire(wt, ht);  // ver
      wtemp3 = new Wire(wt, len/2);  // next hor
      len_temp = len;
      ht_temp = ht;
      len /= 2;
      ht  /= 2;
      v--;
      h--;
      option = 1;
    }
    else
    {
      // considers only one vertical link
      assert(h == 0);
      wtemp1 = new Wire(wt, ht); // ver
      wtemp2 = new Wire(wt, ht/2);  // hor
      ht_temp = ht;
      ht /= 2;
      wtemp3 = 0;
      v--;
      option = 2;
    }
    delay += wtemp1->delay;
    power.readOp.dynamic += wtemp1->power.readOp.dynamic;
    power.searchOp.dynamic += wtemp1->power.readOp.dynamic*wire_bw;
    power.readOp.leakage += wtemp1->power.readOp.leakage*wire_bw;
    power.readOp.gate_leakage += wtemp1->power.readOp.gate_leakage*wire_bw;
    if ((uca_tree == false && option == 2) || search_tree==true)
    {
      wire_bw*=2;  // wire bandwidth doubles only for vertical branches
    }
    if (uca_tree == false)
    {
      if (len_temp > wtemp1->repeater_spacing)
      {
        s1 = wtemp1->repeater_size;
        l_eff = wtemp1->repeater_spacing;
      }
      else
      {
        s1 = (len_temp/wtemp1->repeater_spacing) * wtemp1->repeater_size;
        l_eff = len_temp;
      }
      if (ht_temp > wtemp2->repeater_spacing)
      {
        s2 = wtemp2->repeater_size;
      }
      else
      {
        s2 = (len_temp/wtemp2->repeater_spacing) * wtemp2->repeater_size;
      }
      // first level
      input_nand(s1, s2, l_eff);
    }
    if (option != 1)
    {
      continue;
    }
    // second level
    delay += wtemp2->delay;
    power.readOp.dynamic += wtemp2->power.readOp.dynamic;
    power.searchOp.dynamic += wtemp2->power.readOp.dynamic*wire_bw;
    power.readOp.leakage += wtemp2->power.readOp.leakage*wire_bw;
    power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
    if (uca_tree)
    {
      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
    }
    else
    {
      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
      wire_bw*=2;
      if (ht_temp > wtemp3->repeater_spacing)
      {
        s3    = wtemp3->repeater_size;
        l_eff = wtemp3->repeater_spacing;
      }
      else
      {
        s3    = (len_temp/wtemp3->repeater_spacing) * wtemp3->repeater_size;
        l_eff = ht_temp;
      }
      input_nand(s2, s3, l_eff);
    }
  }
  if (wtemp1) delete wtemp1;
  if (wtemp2) delete wtemp2;
  if (wtemp3) delete wtemp3;
 }
 /* Computes the area, delay and power of the output H-tree and stores
  * them in this component's area, delay and power members.
  *
  * A tristate buffer is used to handle fan-ins.
  * The area of an unbalanced htree (rows != columns)
  * depends on how data is traversed.
  * In the following function, if (no. of rows < no. of columns),
  * then data first traverses the excess horizontal links until the
  * number of vertical and horizontal nodes is the same.
  * If the no. of rows is bigger, then data traverses
  * a horizontal link followed by a vertical link in a repeated
  * fashion (similar to a balanced tree) until there are no
  * horizontal links left. After this it goes through the remaining
  * vertical links.
  */
 void Htree2::out_htree()
 {
  //temp var
  double s1 = 0, s2 = 0, s3 = 0;
  double l_eff = 0;
  Wire *wtemp1 = 0, *wtemp2 = 0, *wtemp3 = 0;
  double len = 0, ht = 0;
  int option = 0;
  // h / v: number of horizontal / vertical tree levels still to be walked
  int h = (int) _log2(ndwl/2);
  int v = (int) _log2(ndbl/2);
  double len_temp;
  double ht_temp;
  // Step 1: half-height (ht_temp) and half-width (len_temp) of the tree,
  // i.e. the mat/bank array plus the routing tracks of the tree wires.
  if (uca_tree)
  {
    ht_temp = (mat_height*ndbl/2 +/* since uca_tree models interbank tree, mat_height => bank height */
        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
         2 * (1-pow(0.5,h))))/2;
    len_temp = (mat_width*ndwl/2 +
        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
         2 * (1-pow(0.5,v))))/2;
  }
  else
    {
    // Balanced tree: same number of mats in both directions.
    if (ndwl == ndbl) {
      ht_temp = ((mat_height*ndbl/2) +
          ((add_bits+ (search_data_in_bits + search_data_out_bits)) * (ndbl/2-1) * g_tp.wire_outside_mat.pitch) +
          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
          )/2;
      len_temp = (mat_width*ndwl/2 +
        ((add_bits + (search_data_in_bits + search_data_out_bits)) * (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
        ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
    }
    // Unbalanced tree with more horizontal than vertical levels.
    else if (ndwl > ndbl) {
      double excess_part = (_log2(ndwl/2) - _log2(ndbl/2));
      ht_temp = ((mat_height*ndbl/2) +
          ((add_bits + (search_data_in_bits + search_data_out_bits)) * ((ndbl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch *
          (2*(1 - pow(0.5, h-v)) + pow(0.5, v-h) * v))/2;
      len_temp = (mat_width*ndwl/2 +
        ((add_bits + (search_data_in_bits + search_data_out_bits))* (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
        ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
    }
    // Unbalanced tree with more vertical than horizontal levels.
    else {
      double excess_part = (_log2(ndbl/2) - _log2(ndwl/2));
      ht_temp = ((mat_height*ndbl/2) +
          ((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
          )/2;
      len_temp = (mat_width*ndwl/2 +
          ((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * (h + 2*(1-pow(0.5, v-h))))/2;
    }
  }
  // Step 2: publish the full dimensions and reset the accumulators.
  area.h = ht_temp * 2;
  area.w = len_temp * 2;
  delay = 0;
  power.readOp.dynamic = 0;
  power.readOp.leakage = 0;
  power.readOp.gate_leakage = 0;
  // NOTE(review): power.searchOp.dynamic is accumulated in the loop below but
  // is not reset here -- confirm it is zero on entry.
  //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
  // Length of the first horizontal link and of the first vertical link.
  len = len_temp;
  ht = ht_temp/2;
  // Step 3: walk the tree one (or two) links per iteration; each link is
  // half as long as the previous link in the same direction.
  while (v > 0 || h > 0)
  { //finds delay/power of each link in the tree
    // Release the wires created by the previous iteration.
    if (wtemp1) delete wtemp1;
    if (wtemp2) delete wtemp2;
    if (wtemp3) delete wtemp3;
    if(h > v) {
      //the iteration considers only one horizontal link
      wtemp1 = new Wire(wt, len); // hor
      wtemp2 = new Wire(wt, len/2);  // ver
      len_temp = len;
      len /= 2;
      wtemp3 = 0;
      h--;
      option = 0;
    }
    else if (v>0 && h>0) {
      //considers one horizontal link and one vertical link
      wtemp1 = new Wire(wt, len); // hor
      wtemp2 = new Wire(wt, ht);  // ver
      wtemp3 = new Wire(wt, len/2);  // next hor
      len_temp = len;
      ht_temp = ht;
      len /= 2;
      ht /= 2;
      v--;
      h--;
      option = 1;
    }
    else {
      // considers only one vertical link
      assert(h == 0);
      wtemp1 = new Wire(wt, ht); // ver
      wtemp2 = new Wire(wt, ht/2);  // next link (half length)
      ht_temp = ht;
      ht /= 2;
      wtemp3 = 0;
      v--;
      option = 2;
    }
    // First link of this iteration: delay and dynamic energy are per wire;
    // search energy is scaled by the root bus width and leakage by the
    // current bus width.
    delay += wtemp1->delay;
    power.readOp.dynamic += wtemp1->power.readOp.dynamic;
    power.searchOp.dynamic += wtemp1->power.readOp.dynamic*init_wire_bw;
    power.readOp.leakage += wtemp1->power.readOp.leakage*wire_bw;
    power.readOp.gate_leakage += wtemp1->power.readOp.gate_leakage*wire_bw;
    //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
    // Bus width doubles after a vertical-only link of a non-UCA tree, and
    // after every iteration of a search tree.
    if ((uca_tree == false && option == 2) || search_tree==true)
    {
      wire_bw*=2;
    }
    if (uca_tree == false)
    {
      // Repeater sizes for the buffer at this node: full size when the link
      // is longer than the repeater spacing, otherwise scaled down linearly.
      // In options 0 and 2 only one of len_temp/ht_temp was refreshed above;
      // the other still holds its value from an earlier step.
      if (len_temp > wtemp1->repeater_spacing)
      {
        s1 = wtemp1->repeater_size;
        l_eff = wtemp1->repeater_spacing;
      }
      else
      {
        s1 = (len_temp/wtemp1->repeater_spacing) * wtemp1->repeater_size;
        l_eff = len_temp;
      }
      if (ht_temp > wtemp2->repeater_spacing)
      {
        s2 = wtemp2->repeater_size;
      }
      else
      {
        // NOTE(review): the condition tests ht_temp but the scaling uses
        // len_temp -- confirm this is intended.
        s2 = (len_temp/wtemp2->repeater_spacing) * wtemp2->repeater_size;
      }
      // first level
      output_buffer(s1, s2, l_eff);
    }
    // Only the combined horizontal+vertical case has a second link
    // (and a non-null wtemp3) to account for.
    if (option != 1)
    {
      continue;
    }
    // second level
    delay += wtemp2->delay;
    power.readOp.dynamic += wtemp2->power.readOp.dynamic;
    power.searchOp.dynamic += wtemp2->power.readOp.dynamic*init_wire_bw;
    power.readOp.leakage += wtemp2->power.readOp.leakage*wire_bw;
    power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
    //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
    // NOTE(review): both branches below add wtemp2's leakage a second time on
    // top of the two lines above -- confirm the double count is intended.
    if (uca_tree)
    {
      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
    }
    else
    {
      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
      wire_bw*=2;
      // Buffer between the vertical link and the next horizontal link.
      if (ht_temp > wtemp3->repeater_spacing)
      {
        s3 = wtemp3->repeater_size;
        l_eff = wtemp3->repeater_spacing;
      }
      else
      {
        s3 = (len_temp/wtemp3->repeater_spacing) * wtemp3->repeater_size;
        l_eff = ht_temp;
      }
      output_buffer(s2, s3, l_eff);
    }
    //cout<<"power.readOp.leakage"<<power.readOp.leakage<<endl;
    //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
    //cout<<"wtemp2->power.readOp.gate_leakage"<<wtemp2->power.readOp.gate_leakage<<endl;
  }
  // Release the wires of the final iteration.
  if (wtemp1) delete wtemp1;
  if (wtemp2) delete wtemp2;
  if (wtemp3) delete wtemp3;
 }
--- a/ext/mcpat/cacti/htree2.h
+++ b/ext/mcpat/cacti/htree2.h
@ -0,0 +1,97 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __HTREE2_H__
 #define __HTREE2_H__
 #include "assert.h"
 #include "basic_circuit.h"
 #include "cacti_interface.h"
 #include "component.h"
 #include "parameter.h"
 #include "subarray.h"
 #include "wire.h"
 // H-tree interconnect model, built from Wire segments.
 // leakage power includes the entire htree in a bank (when uca_tree == false)
 // leakage power includes only the part to one bank when uca_tree == true
 class Htree2 : public Component
 {
  public:
    // mat_w / mat_h: mat dimensions (bank dimensions when uca_tree_ is true);
    // add, data_in, search_data_in, data_out, search_data_out: bus widths in bits;
    // bl / wl: stored as ndbl / ndwl.
    // NOTE(review): the constructor body is not in this header -- confirm the
    // parameter-to-member mapping against htree2.cc.
    Htree2(enum Wire_type wire_model,
        double mat_w, double mat_h, int add, int data_in, int search_data_in, int data_out, int search_data_out, int bl, int wl,
        enum Htree_type h_type, bool uca_tree_ = false, bool search_tree_ = false,
        TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
    ~Htree2() {};
    // Compute area, delay and power of the input / output tree.
    void in_htree();
    void out_htree();
    // repeaters only at h-tree nodes
    void limited_in_htree();
    void limited_out_htree();
    // Per-node gate models; in_htree()/out_htree() call them with the two
    // repeater sizes at a node (s1, s2) and an effective wire length (l).
    void input_nand(double s1, double s2, double l);
    void output_buffer(double s1, double s2, double l);
    double in_rise_time, out_rise_time;
    void set_in_rise_time(double rt)
    {
      in_rise_time = rt;
    }
    double max_unpipelined_link_delay;
    powerDef power_bit;
  private:
    double wire_bw;       // current bus width; doubled while walking the tree
    double init_wire_bw;  // bus width at root
    enum Htree_type tree_type;
    double htree_hnodes;
    double htree_vnodes;
    double mat_width;
    double mat_height;
    int add_bits, data_in_bits,search_data_in_bits,data_out_bits,  search_data_out_bits;
    int ndbl, ndwl;  // used as log2(ndbl/2) vertical and log2(ndwl/2) horizontal tree levels
    bool uca_tree; // should have full bandwidth to access all banks in the array simultaneously
    bool search_tree;
    enum Wire_type wt;  // wire type passed to every Wire created for the tree
    double min_w_nmos;
    double min_w_pmos;
    TechnologyParameter::DeviceType *deviceType;
 };
 #endif
--- a/ext/mcpat/cacti/io.cc
+++ b/ext/mcpat/cacti/io.cc
--- a/ext/mcpat/cacti/io.h
+++ b/ext/mcpat/cacti/io.h
@ -0,0 +1,44 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __IO_H__
 #define __IO_H__
 #include "cacti_interface.h"
 #include "const.h"
 // Reporting helpers for a finished CACTI solution (uca_org_t).
 // NOTE(review): presumably output_data_csv emits the result as CSV and
 // output_UCA prints a detailed report -- confirm against io.cc.
 void output_data_csv(const uca_org_t & fin_res);
 void output_UCA(uca_org_t * fin_res);
 #endif
--- a/ext/mcpat/cacti/main.cc
+++ b/ext/mcpat/cacti/main.cc
@ -0,0 +1,191 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include <iostream>
 #include "io.h"
 using namespace std;
 /*
  * Command-line entry point for CACTI.
  *
  * Three invocation forms are accepted:
  *   - any argument count other than 53 or 55: the arguments are scanned for
  *     "-infile <name>" and the named input file is passed to cacti_interface();
  *     if no such pair is found, usage is printed and the process exits with 1;
  *   - argc == 53: arguments 1..52 are passed positionally (argument 10 is
  *     parsed as a floating-point value, all others as integers);
  *   - argc == 55: arguments 1..54 are passed positionally (argument 9 is
  *     parsed as a floating-point value, all others as integers).
  *
  * Returns 0 after releasing the result with result.cleanup().
  */
 int main(int argc,char *argv[])
 {
  uca_org_t result;
  if (argc != 53 && argc != 55)
  {
    bool infile_specified = false;
    string infile_name("");
    for (int32_t i = 0; i < argc; i++)
    {
      // "-infile" only counts when a file name follows it. Without the
      // bounds check a trailing "-infile" would read argv[argc], which is a
      // null pointer, and assigning that to a std::string is undefined
      // behaviour. A trailing "-infile" now falls through to the usage error.
      if (argv[i] == string("-infile") && i + 1 < argc)
      {
        infile_specified = true;
        i++;
        infile_name = argv[i];
      }
    }
    if (infile_specified == false)
    {
      cerr << " Invalid arguments -- how to use CACTI:" << endl;
      cerr << "  1) cacti -infile <input file name>" << endl;
      cerr << "  2) cacti arg1 ... arg52 -- please refer to the README file" << endl;
      cerr << " No. of arguments input - " << argc << endl;
      exit(1);
    }
    else
    {
      result = cacti_interface(infile_name);
    }
  }
  else if (argc == 53)
  {
          // 52 positional arguments; only argv[10] is floating point.
          result = cacti_interface(atoi(argv[ 1]),
                          atoi(argv[ 2]),
                          atoi(argv[ 3]),
                          atoi(argv[ 4]),
                          atoi(argv[ 5]),
                          atoi(argv[ 6]),
                          atoi(argv[ 7]),
                          atoi(argv[ 8]),
                          atoi(argv[ 9]),
                          atof(argv[10]),
                          atoi(argv[11]),
                          atoi(argv[12]),
                          atoi(argv[13]),
                          atoi(argv[14]),
                          atoi(argv[15]),
                          atoi(argv[16]),
                          atoi(argv[17]),
                          atoi(argv[18]),
                          atoi(argv[19]),
                          atoi(argv[20]),
                          atoi(argv[21]),
                          atoi(argv[22]),
                          atoi(argv[23]),
                          atoi(argv[24]),
                          atoi(argv[25]),
                          atoi(argv[26]),
                          atoi(argv[27]),
                          atoi(argv[28]),
                          atoi(argv[29]),
                          atoi(argv[30]),
                          atoi(argv[31]),
                          atoi(argv[32]),
                          atoi(argv[33]),
                          atoi(argv[34]),
                          atoi(argv[35]),
                          atoi(argv[36]),
                          atoi(argv[37]),
                          atoi(argv[38]),
                          atoi(argv[39]),
                          atoi(argv[40]),
                          atoi(argv[41]),
                          atoi(argv[42]),
                          atoi(argv[43]),
                          atoi(argv[44]),
                          atoi(argv[45]),
                          atoi(argv[46]),
                          atoi(argv[47]),
                          atoi(argv[48]),
                          atoi(argv[49]),
                          atoi(argv[50]),
                          atoi(argv[51]),
                          atoi(argv[52]));
  }
  else
  {
          // 54 positional arguments; only argv[9] is floating point.
          result = cacti_interface(atoi(argv[ 1]),
                          atoi(argv[ 2]),
                          atoi(argv[ 3]),
                          atoi(argv[ 4]),
                          atoi(argv[ 5]),
                          atoi(argv[ 6]),
                          atoi(argv[ 7]),
                          atoi(argv[ 8]),
                          atof(argv[ 9]),
                          atoi(argv[10]),
                          atoi(argv[11]),
                          atoi(argv[12]),
                          atoi(argv[13]),
                          atoi(argv[14]),
                          atoi(argv[15]),
                          atoi(argv[16]),
                          atoi(argv[17]),
                          atoi(argv[18]),
                          atoi(argv[19]),
                          atoi(argv[20]),
                          atoi(argv[21]),
                          atoi(argv[22]),
                          atoi(argv[23]),
                          atoi(argv[24]),
                          atoi(argv[25]),
                          atoi(argv[26]),
                          atoi(argv[27]),
                          atoi(argv[28]),
                          atoi(argv[29]),
                          atoi(argv[30]),
                          atoi(argv[31]),
                          atoi(argv[32]),
                          atoi(argv[33]),
                          atoi(argv[34]),
                          atoi(argv[35]),
                          atoi(argv[36]),
                          atoi(argv[37]),
                          atoi(argv[38]),
                          atoi(argv[39]),
                          atoi(argv[40]),
                          atoi(argv[41]),
                          atoi(argv[42]),
                          atoi(argv[43]),
                          atoi(argv[44]),
                          atoi(argv[45]),
                          atoi(argv[46]),
                          atoi(argv[47]),
                          atoi(argv[48]),
                          atoi(argv[49]),
                          atoi(argv[50]),
                          atoi(argv[51]),
                          atoi(argv[52]),
                          atoi(argv[53]),
                          atoi(argv[54]));
  }
  // Release whatever the solver allocated for the result.
  result.cleanup();
  return 0;
 }
--- a/ext/mcpat/cacti/makefile
+++ b/ext/mcpat/cacti/makefile
@ -0,0 +1,28 @@
 # Top-level wrapper: creates the per-configuration object directory and then
 # delegates the real work to $(TAR).mk with TAG set to dbg or opt.
 # TAR is the base name of that secondary makefile.
 TAR = cacti
 # NOTE(review): "depend" is declared phony but has no rule in this file.
 .PHONY: dbg opt depend clean clean_dbg clean_opt
 # Default target: the optimised build.
 all: opt
 # Debug build.
 dbg: $(TAR).mk obj_dbg
 	@$(MAKE) TAG=dbg -C . -f $(TAR).mk
 # Optimised build.
 opt: $(TAR).mk obj_opt
 	@$(MAKE) TAG=opt -C . -f $(TAR).mk
 # Object directories, created on demand.
 obj_dbg:
 	mkdir $@
 obj_opt:
 	mkdir $@
 clean: clean_dbg clean_opt
 # Each clean target depends on its object directory (so it is created first
 # if missing), runs the secondary makefile's clean, then removes the directory.
 clean_dbg: obj_dbg
 	@$(MAKE) TAG=dbg -C . -f $(TAR).mk clean
 	rm -rf $<
 clean_opt: obj_opt
 	@$(MAKE) TAG=opt -C . -f $(TAR).mk clean
 	rm -rf $<
--- a/ext/mcpat/cacti/mat.cc
+++ b/ext/mcpat/cacti/mat.cc
--- a/ext/mcpat/cacti/mat.h
+++ b/ext/mcpat/cacti/mat.h
@ -0,0 +1,148 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __MAT_H__
 #define __MAT_H__
 #include "component.h"
 #include "decoder.h"
 #include "subarray.h"
 #include "wire.h"
 // Model of a single mat: holds the decoders, predecoders, drivers and
 // subarray that make up the mat, plus the delay and power values computed
 // for them. Most members are public and read directly by other classes.
 class Mat : public Component
 {
  public:
    Mat(const DynamicParameter & dyn_p);
    ~Mat();
    double compute_delays(double inrisetime);  // return outrisetime
    void compute_power_energy();
    const DynamicParameter & dp;
    // TODO: clean up pointers and powerDefs below
    // Decoders for the row, the bitline mux and the two sense-amp mux levels.
    Decoder * row_dec;
    Decoder * bit_mux_dec;
    Decoder * sa_mux_lev_1_dec;
    Decoder * sa_mux_lev_2_dec;
    PredecBlk * dummy_way_sel_predec_blk1;
    PredecBlk * dummy_way_sel_predec_blk2;
    PredecBlkDrv * way_sel_drv1;
    PredecBlkDrv * dummy_way_sel_predec_blk_drv2;
    // Predecoders matching the four decoders above.
    Predec * r_predec;
    Predec * b_mux_predec;
    Predec * sa_mux_lev_1_predec;
    Predec * sa_mux_lev_2_predec;
    Wire   * subarray_out_wire;
    Driver * bl_precharge_eq_drv;
    Driver * cam_bl_precharge_eq_drv;//bitline pre-charge circuit is separated for CAM and RAM arrays.
    Driver * ml_precharge_drv;//matchline precharge driver
    Driver * sl_precharge_eq_drv;//searchline precharge driver
    Driver * sl_data_drv;//search line data driver
    Driver * ml_to_ram_wl_drv;//NOTE(review): original comment repeated "search line data driver"; the name suggests a matchline-to-RAM-wordline driver -- confirm
    // Power of the individual circuit blocks.
    powerDef power_row_decoders;
    powerDef power_bit_mux_decoders;
    powerDef power_sa_mux_lev_1_decoders;
    powerDef power_sa_mux_lev_2_decoders;
    powerDef power_fa_cam;  // TODO: leakage power is not computed yet
    powerDef power_bl_precharge_eq_drv;
    powerDef power_subarray_out_drv;
    powerDef power_cam_all_active;
    powerDef power_searchline_precharge;
    powerDef power_matchline_precharge;
    powerDef power_ml_to_ram_wl_drv;
    // Delay components.
    double   delay_fa_tag, delay_cam;
    double   delay_before_decoder;
    double   delay_bitline;
    double   delay_wl_reset;
    double   delay_bl_restore;
    double   delay_searchline;
    double   delay_matchchline;  // (sic) spelling kept: the name is part of the public interface
    double   delay_cam_sl_restore;
    double   delay_cam_ml_reset;
    double   delay_fa_ram_wl;
    double   delay_hit_miss_reset;
    double   delay_hit_miss;
    Subarray subarray;
    powerDef power_bitline, power_searchline, power_matchline;
    double   per_bitline_read_energy;
    int      deg_bl_muxing;
    int      num_act_mats_hor_dir;
    double   delay_writeback;
    Area     cell,cam_cell;
    bool     is_dram,is_fa, pure_cam, camFlag;
    int      num_mats;
    powerDef power_sa;
    double   delay_sa;
    double   leak_power_sense_amps_closed_page_state;
    double   leak_power_sense_amps_open_page_state;
    double   delay_subarray_out_drv;
    double   delay_subarray_out_drv_htree;
    double   delay_comparator;
    powerDef power_comparator;
    int      num_do_b_mat;
    int      num_so_b_mat;
    int      num_sa_subarray;
    int      num_sa_subarray_search;
    double   C_bl;
    uint32_t num_subarrays_per_mat;  // the number of subarrays in a mat
    uint32_t num_subarrays_per_row;  // the number of subarrays in a row of a mat
  private:
    // Geometry helpers.
    double compute_bit_mux_sa_precharge_sa_mux_wr_drv_wr_mux_h();
    double width_write_driver_or_write_mux();
    double compute_comparators_height(int tagbits, int number_ways_in_mat, double subarray_mem_cell_area_w);
    // Per-stage delay helpers; each takes the input rise time of its stage.
    double compute_cam_delay(double inrisetime);
    double compute_bitline_delay(double inrisetime);
    double compute_sa_delay(double inrisetime);
    double compute_subarray_out_drv(double inrisetime);
    double compute_comparator_delay(double inrisetime);
    // NOTE(review): presumably the numbers of read/write, exclusive-read,
    // exclusive-write and search ports -- confirm against mat.cc.
    int RWP;
    int ERP;
    int EWP;
    int SCHP;
 };
 #endif
--- a/ext/mcpat/cacti/nuca.cc
+++ b/ext/mcpat/cacti/nuca.cc
@ -0,0 +1,612 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
 #include <cassert>
 #include "Ucache.h"
 #include "nuca.h"
 /*
  * Lower bound on the bank size explored by Nuca::sim_nuca(). sim_nuca()
  * doubles this global in place while scaling it for the cache
  * associativity, so its value persists across calls.
  * NOTE(review): unit is presumably bytes (it divides g_ip->cache_sz) - confirm.
  */
 unsigned int MIN_BANKSIZE=65536;
 /* Subtracted from the clock period in Nuca::calc_cycles(). */
 #define FIXED_OVERHEAD 55e-12 /* clock skew and jitter in s. Ref: Hrishikesh et al ISCA 01 */
 /* Also subtracted from the clock period in Nuca::calc_cycles(). */
 #define LATCH_DELAY 28e-12 /* latch delay in s (later should use FO4 TODO) */
 /* Constant added to every per-bank access latency (in cycles) in sim_nuca(). */
 #define CONTR_2_BANK_LAT 0
 /*
  * Contention table filled by Nuca::init_cont() from contention.dat and read
  * by Nuca::sim_nuca(). Only core indices 2..4 are populated/used; sim_nuca()
  * maps <=4 cores to 2, <=8 cores to 3 and <=16 cores to 4. The last index is
  * (number of bank access cycles)/2 - 1.
  */
 int cont_stats[2 /*l2 or l3*/][5/* cores */][ROUTER_TYPES][7 /*banks*/][8 /* cycle time */];
  /*
   * Construct a NUCA model.
   *
   * dt - device type parameters to use; defaults to the global peripheral
   *      device type.
   *
   * The wire bookkeeping is put into a well-defined "empty" state
   * (wt_min > wt_max, all wire slots NULL) so that ~Nuca() is safe even when
   * sim_nuca() has never been run; sim_nuca() overwrites these later.
   * The contention table is then loaded from contention.dat.
   */
  Nuca::Nuca(
      TechnologyParameter::DeviceType *dt = &(g_tp.peri_global)
      ):deviceType(dt), wt_min(0), wt_max(-1)
 {
  for (int i = 0; i < WIRE_TYPES; i++) {
    wire_vertical[i] = NULL;
    wire_horizontal[i] = NULL;
  }
  init_cont();
 }
 /*
  * Load the contention statistics table (cont_stats) from "contention.dat"
  * in the current working directory.
  *
  * One text line is consumed per (cache level, core index 2..4, router type,
  * bank index) combination; each line is expected to look like
  * "<label>: v0 v1 ... v7". Values missing from a line leave the
  * corresponding cont_stats entries untouched.
  *
  * Terminates the process if the file is missing (exit status 0, kept for
  * backward compatibility) or if it ends before all lines were read.
  */
 void
 Nuca::init_cont()
 {
  FILE *cont;
  char line[5000];
  char jk[5000];
  cont = fopen("contention.dat", "r");
  if (!cont) {
    cout << "contention.dat file is missing!\n";
    exit(0);
  }
  for(int i=0; i<2; i++) {
    for(int j=2; j<5; j++) {
      for(int k=0; k<ROUTER_TYPES; k++) {
        for(int l=0;l<7; l++) {
          int *temp = cont_stats[i/*l2 or l3*/][j/*core*/][k/*64 or 128 or 256 link bw*/][l /* no banks*/];
          /*
           * The read must not live inside assert(): with NDEBUG the whole
           * call would be compiled out and `line` would be used
           * uninitialized. The field width keeps the read inside `line`.
           */
          if (fscanf(cont, "%4999[^\n]\n", line) != 1) {
            cout << "contention.dat file is truncated or malformed!\n";
            fclose(cont);
            exit(1);
          }
          /* jk only swallows the label in front of the ':' */
          sscanf(line, "%4999[^:]: %d %d %d %d %d %d %d %d",jk, &temp[0], &temp[1], &temp[2], &temp[3],
              &temp[4], &temp[5], &temp[6], &temp[7]);
        }
      }
    }
  }
  fclose(cont);
 }
  /*
   * Dump the populated part of the contention table to stdout: one output
   * line per (cache level, core index 2..4, router type, bank index), each
   * holding the 8 cycle-time entries separated by spaces.
   */
  void
 Nuca::print_cont_stats()
 {
  for(int i=0; i<2; i++) {
    for(int j=2; j<5; j++) {
      for(int k=0; k<ROUTER_TYPES; k++) {
        for(int l=0;l<7; l++) {
          /* iterate over the cycle-time dimension (size 8) with its own index */
          for(int m=0;m<8; m++) {
            cout << cont_stats[i][j][k][l][m] << " ";
          }
          cout << endl;
        }
      }
    }
  }
  cout << endl;
 }
 /*
  * Release the Wire objects stored in the wt_min..wt_max slots of the
  * vertical and horizontal wire tables.
  * NOTE(review): in the visible code only sim_nuca() assigns wt_min, wt_max
  * and the table entries - confirm they are valid if sim_nuca() never ran.
  */
 Nuca::~Nuca(){
  int wt = wt_min;
  while (wt <= wt_max) {
    delete wire_vertical[wt];
    delete wire_horizontal[wt];
    ++wt;
  }
 }
 /*
  * Convert a latency into a whole number of clock cycles.
  *
  * lat       - latency in seconds
  * oper_freq - operating frequency in GHz
  *
  * The usable part of each clock period is the period minus the latch delay
  * and the fixed clock overhead; the result is rounded up.
  */
  int
 Nuca::calc_cycles(double lat, double oper_freq)
 {
  //TODO: convert latch delay to FO4 */
  const double period = (1.0/(oper_freq*1e9)); /*s*/
  const double usable_period = period - LATCH_DELAY - FIXED_OVERHEAD;
  const double cycles = ceil(lat/usable_period);
  return (int)cycles;
 }
 /*
  * Intentionally empty: h_wire, v_wire and router are not owned by this
  * object. Nuca::sim_nuca() stores the same Wire pointers in the Nuca wire
  * tables (released by ~Nuca()) and deletes the Router objects itself, so
  * deleting them here as well would free them twice.
  */
 nuca_org_t::~nuca_org_t() {
  // if(h_wire) delete h_wire;
  // if(v_wire) delete v_wire;
  // if(router) delete router;
 }
 /*
  * Version - 6.0
  *
  * Perform an exhaustive search across different bank organizations,
  * router configurations, grid organizations, and wire models and
  * find an optimal NUCA organization.
  * For different bank count values
  * 1. The optimal bank organization is calculated
  * 2. For each bank organization, find different NUCA organizations
  *    using various router configurations, grid organizations,
  *    and wire models.
  * 3. The NUCA model with the least cost is picked for
  *    this particular bank count
  * Finally include contention statistics and find the optimal
  *    NUCA configuration.
  *
  * Side effects: reads and rewrites g_ip->cache_sz / g_ip->nuca_cache_sz,
  * scales the global MIN_BANKSIZE, sets wt_min/wt_max and fills the
  * wire_vertical/wire_horizontal tables, and prints progress and results
  * to stdout/stderr.
  */
  void
 Nuca::sim_nuca()
 {
  /* temp variables */
  int it, ro, wr;
  int num_cyc;
  unsigned int i, j;
  unsigned int r, c;
  int l2_c;
  int bank_count = 0;
  uca_org_t ures;
  nuca_org_t *opt_n;
  mem_array tag, data;
  list<nuca_org_t *> nuca_list;
  Router *router_s[ROUTER_TYPES];
  router_s[0] = new Router(64.0, 8, 4, &(g_tp.peri_global));
  router_s[0]->print_router();
  router_s[1] = new Router(128.0, 8, 4, &(g_tp.peri_global));
  router_s[1]->print_router();
  router_s[2] = new Router(256.0, 8, 4, &(g_tp.peri_global));
  router_s[2]->print_router();
  int core_in; // to store no. of cores
  /* to search diff grid organizations */
  double curr_hop, totno_hops, totno_hhops, totno_vhops, tot_lat,
         curr_acclat;
  double avg_lat, avg_hop, avg_hhop, avg_vhop, avg_dyn_power,
         avg_leakage_power;
  double opt_acclat = INF;
  int opt_rows = 0;
  int opt_columns = 0;
  double opt_avg_hop = 0;
  double opt_dyn_power = 0, opt_leakage_power = 0;
  min_values_t minval;
  int bank_start = 0;
  int flit_width = 0;
  /* vertical and horizontal hop latency values */
  int ver_hop_lat, hor_hop_lat; /* in cycles */
  /* no. of different bank sizes to consider */
  int iterations;
  g_ip->nuca_cache_sz = g_ip->cache_sz;
  nuca_list.push_back(new nuca_org_t());
  if (g_ip->cache_level == 0) l2_c = 1;
  else l2_c = 0;
  if (g_ip->cores <= 4) core_in = 2;
  else if (g_ip->cores <= 8) core_in = 3;
  else if (g_ip->cores <= 16) core_in = 4;
  else {cout << "Number of cores should be <= 16!\n"; exit(0);}
  // set the lower bound to an appropriate value. this depends on cache associativity
  if (g_ip->assoc > 2) {
    i = 2;
    /*
     * '<' instead of '!=': with a non-power-of-two associativity the
     * doubling counter would never hit the value exactly and the loop
     * would not terminate. Identical for power-of-two associativities.
     */
    while (i < g_ip->assoc) {
      MIN_BANKSIZE *= 2;
      i *= 2;
    }
  }
  iterations = (int)logtwo((int)g_ip->cache_sz/MIN_BANKSIZE);
  if (g_ip->force_wiretype)
  {
    if (g_ip->wt == Low_swing) {
      wt_min = Low_swing;
      wt_max = Low_swing;
    }
    else {
      wt_min = Global;
      wt_max = Low_swing-1;
    }
  }
  else {
    wt_min = Global;
    wt_max = Low_swing;
  }
  if (g_ip->nuca_bank_count != 0) { // simulate just one bank
    if (g_ip->nuca_bank_count != 2 && g_ip->nuca_bank_count != 4 &&
        g_ip->nuca_bank_count != 8 && g_ip->nuca_bank_count != 16 &&
        g_ip->nuca_bank_count != 32 && g_ip->nuca_bank_count != 64) {
      fprintf(stderr,"Incorrect bank count value! Please fix the value in cache.cfg\n");
    }
    bank_start = (int)logtwo((double)g_ip->nuca_bank_count);
    iterations = bank_start+1;
    g_ip->cache_sz = g_ip->cache_sz/g_ip->nuca_bank_count;
  }
  cout << "Simulating various NUCA configurations\n";
  for (it=bank_start; it<iterations; it++) { /* different bank count values */
    ures.tag_array2 = &tag;
    ures.data_array2 = &data;
    /*
     * find the optimal bank organization
     */
    solve(&ures);
    bank_count = g_ip->nuca_cache_sz/g_ip->cache_sz;
    cout << "====" <<  g_ip->cache_sz << "\n";
    for (wr=wt_min; wr<=wt_max; wr++) {
      for (ro=0; ro<ROUTER_TYPES; ro++)
      {
        flit_width = (int) router_s[ro]->flit_size; //initialize router
        nuca_list.back()->nuca_pda.cycle_time = router_s[ro]->cycle_time;
        /* calculate router and wire parameters */
        double vlength = ures.cache_ht; /* length of the wire (u)*/
        double hlength = ures.cache_len; // u
        /*
         * find delay, area, and power for wires
         * NOTE(review): the previous Wire objects held in these slots are
         * still referenced by earlier nuca_list entries, so they cannot be
         * freed here; only the last pair per slot is released by ~Nuca().
         */
        wire_vertical[wr] = new Wire((enum Wire_type) wr, vlength);
        wire_horizontal[wr] = new Wire((enum Wire_type) wr, hlength);
        hor_hop_lat = calc_cycles(wire_horizontal[wr]->delay,
            1/(nuca_list.back()->nuca_pda.cycle_time*.001));
        ver_hop_lat = calc_cycles(wire_vertical[wr]->delay,
            1/(nuca_list.back()->nuca_pda.cycle_time*.001));
        /*
         * assume a grid like topology and explore for optimal network
         * configuration using different row and column count values.
         */
        for (c=1; c<=(unsigned int)bank_count; c++) {
          /* only column counts that divide the bank count evenly */
          while (bank_count%c != 0) c++;
          r = bank_count/c;
          /*
           * to find the avg access latency of a NUCA cache, uncontended
           * access time to each bank from the
           * cache controller is calculated.
           * avg latency =
           * (sum of the access latencies to individual banks)/bank
           * count value.
           */
          totno_hops = totno_hhops = totno_vhops = tot_lat = 0;
          for (i=0; i<r; i++) {
            for (j=0; j<c; j++) {
              /*
               * vertical hops including the
               * first hop from the cache controller
               */
              curr_hop = i + 1;
              curr_hop += j; /* horizontal hops */
              totno_hhops += j;
              totno_vhops += (i+1);
              curr_acclat = (i * ver_hop_lat + CONTR_2_BANK_LAT +
                  j * hor_hop_lat);
              tot_lat += curr_acclat;
              totno_hops += curr_hop;
            }
          }
          avg_lat = tot_lat/bank_count;
          avg_hop = totno_hops/bank_count;
          avg_hhop = totno_hhops/bank_count;
          avg_vhop = totno_vhops/bank_count;
          /* net access latency */
          curr_acclat = 2*avg_lat + 2*(router_s[ro]->delay*avg_hop) +
            calc_cycles(ures.access_time,
                1/(nuca_list.back()->nuca_pda.cycle_time*.001));
          /* avg access lat of nuca */
          avg_dyn_power =
            avg_hop *
            (router_s[ro]->power.readOp.dynamic) + avg_hhop *
            (wire_horizontal[wr]->power.readOp.dynamic) *
            (g_ip->block_sz*8 + 64) + avg_vhop *
            (wire_vertical[wr]->power.readOp.dynamic) *
            (g_ip->block_sz*8 + 64) + ures.power.readOp.dynamic;
          /*
           * NOTE(review): the vertical term uses the horizontal wire's
           * delay and is not scaled by flit_width, unlike the horizontal
           * term. Kept as-is to preserve model output - confirm intent.
           */
          avg_leakage_power =
            bank_count * router_s[ro]->power.readOp.leakage +
            avg_hhop * (wire_horizontal[wr]->power.readOp.leakage*
                wire_horizontal[wr]->delay) * flit_width +
            avg_vhop * (wire_vertical[wr]->power.readOp.leakage *
                wire_horizontal[wr]->delay);
          if (curr_acclat < opt_acclat) {
            opt_acclat = curr_acclat;
            opt_avg_hop = avg_hop;
            opt_rows = r;
            opt_columns = c;
            opt_dyn_power = avg_dyn_power;
            opt_leakage_power = avg_leakage_power;
          }
        }
        nuca_list.back()->wire_pda.power.readOp.dynamic =
          opt_avg_hop * flit_width *
          (wire_horizontal[wr]->power.readOp.dynamic +
           wire_vertical[wr]->power.readOp.dynamic);
        nuca_list.back()->avg_hops = opt_avg_hop;
        /* network delay/power */
        nuca_list.back()->h_wire = wire_horizontal[wr];
        nuca_list.back()->v_wire = wire_vertical[wr];
        nuca_list.back()->router = router_s[ro];
        /* bank delay/power */
        nuca_list.back()->bank_pda.delay = ures.access_time;
        nuca_list.back()->bank_pda.power = ures.power;
        nuca_list.back()->bank_pda.area.h = ures.cache_ht;
        nuca_list.back()->bank_pda.area.w = ures.cache_len;
        nuca_list.back()->bank_pda.cycle_time = ures.cycle_time;
        num_cyc = calc_cycles(nuca_list.back()->bank_pda.delay /*s*/,
            1/(nuca_list.back()->nuca_pda.cycle_time*.001/*GHz*/));
        if(num_cyc%2 != 0) num_cyc++;
        if (num_cyc > 16) num_cyc = 16; // we have data only up to 16 cycles
        if (num_cyc < 2) num_cyc = 2;   // keep the table index below non-negative
        /*
         * The contention table has 7 bank entries (indices 0..6); larger
         * bank-count steps reuse the last one. Index 7 would read past the
         * end of that dimension.
         */
        const int bank_idx = (it < 7) ? it : 6;
        const int contention_cycles =
          cont_stats[l2_c][core_in][ro][bank_idx][num_cyc/2-1];
        nuca_list.back()->nuca_pda.delay = opt_acclat + contention_cycles;
        nuca_list.back()->contention = contention_cycles;
        nuca_list.back()->nuca_pda.power.readOp.dynamic = opt_dyn_power;
        nuca_list.back()->nuca_pda.power.readOp.leakage = opt_leakage_power;
        /* array organization */
        nuca_list.back()->bank_count = bank_count;
        nuca_list.back()->rows = opt_rows;
        nuca_list.back()->columns = opt_columns;
        calculate_nuca_area (nuca_list.back());
        minval.update_min_values(nuca_list.back());
        /* start a fresh record for the next (wire, router) combination */
        nuca_list.push_back(new nuca_org_t());
        opt_acclat = BIGNUM;
      }
    }
    g_ip->cache_sz /= 2;
  }
  /* the last record was pushed speculatively and never filled in */
  delete(nuca_list.back());
  nuca_list.pop_back();
  opt_n = find_optimal_nuca(&nuca_list, &minval);
  if (opt_n) {
    print_nuca(opt_n);
    g_ip->cache_sz = g_ip->nuca_cache_sz/opt_n->bank_count;
  }
  else {
    /* no candidate met the deviation constraints; nothing to report */
    cout << "No NUCA organization satisfies the given constraints!\n";
  }
  list<nuca_org_t *>::iterator niter;
  for (niter = nuca_list.begin(); niter != nuca_list.end(); ++niter)
  {
    delete *niter;
  }
  nuca_list.clear();
  for(int i=0; i < ROUTER_TYPES; i++)
  {
    delete router_s[i];
  }
  g_ip->display_ip();
 }
  /*
   * Print a human-readable report for one NUCA organization to stdout:
   * bank count, grid shape, network frequency, cache dimensions, router
   * details, and the delay/energy figures of the horizontal and vertical
   * links.
   *
   * fr - organization to report; its router, h_wire and v_wire are
   *      dereferenced.
   */
  void
 Nuca::print_nuca (nuca_org_t *fr)
 {
  Wire *hor = fr->h_wire;
  Wire *ver = fr->v_wire;

  printf("\n---------- CACTI version 6.5, Non-uniform Cache Access "
      "----------\n\n");
  printf("Optimal number of banks - %d\n", fr->bank_count);
  printf("Grid organization rows x columns - %d x %d\n",
      fr->rows, fr->columns);
  printf("Network frequency - %g GHz\n",
      (1/fr->nuca_pda.cycle_time)*1e3);
  printf("Cache dimension (mm x mm) - %g x %g\n",
      fr->nuca_pda.area.h,
      fr->nuca_pda.area.w);

  fr->router->print_router();

  printf("\n\nWire stats:\n");
  /* describe the wire model selected for the horizontal links */
  switch (hor->wt) {
    case Global:
      printf("\tWire type - Full swing global wires with least "
          "possible delay\n");
      break;
    case Global_5:
      printf("\tWire type - Full swing global wires with "
          "5%% delay penalty\n");
      break;
    case Global_10:
      printf("\tWire type - Full swing global wires with "
          "10%% delay penalty\n");
      break;
    case Global_20:
      printf("\tWire type - Full swing global wires with "
          "20%% delay penalty\n");
      break;
    case Global_30:
      printf("\tWire type - Full swing global wires with "
          "30%% delay penalty\n");
      break;
    case Low_swing:
      printf("\tWire type - Low swing wires\n");
      break;
    default:
      /* other wire types get no description line */
      break;
  }

  printf("\tHorizontal link delay - %g (ns)\n",
      hor->delay*1e9);
  printf("\tVertical link delay - %g (ns)\n",
      ver->delay*1e9);
  printf("\tDelay/length - %g (ns/mm)\n",
      hor->delay*1e9/fr->bank_pda.area.w);
  printf("\tHorizontal link energy -dynamic/access %g (nJ)\n"
      "\t                       -leakage %g (nW)\n\n",
      hor->power.readOp.dynamic*1e9,
      hor->power.readOp.leakage*1e9);
  printf("\tVertical link energy -dynamic/access %g (nJ)\n"
      "\t                     -leakage %g (nW)\n\n",
      ver->power.readOp.dynamic*1e9,
      ver->power.readOp.leakage*1e9);
  printf("\n\n");
  ver->print_wire();
  printf("\n\nBank stats:\n");
 }
  /*
   * Pick the lowest-cost NUCA organization from a candidate list.
   *
   * n      - candidate list; when g_ip->ed is neither 1 nor 2, candidates
   *          that violate the deviation constraints (check_nuca_org) are
   *          removed from the list and deleted.
   * minval - minimum delay/power/area/cycle-time values used to normalize
   *          the cost terms; min_leakage is bumped to 0.1 if it is 0.
   *
   * Cost function: g_ip->ed == 1 uses energy*delay, g_ip->ed == 2 uses
   * energy*delay^2, anything else uses the weighted sum of normalized
   * delay, cycle time, dynamic power, leakage power and area.
   *
   * Returns the best candidate (still owned by the list), or NULL when no
   * candidate qualifies. Prints per-candidate statistics to stdout/stderr.
   */
  nuca_org_t *
 Nuca::find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval)
 {
  double cost = 0;
  double min_cost = BIGNUM;
  nuca_org_t *res = NULL;
  float d, a, dp, lp, c;
  dp = g_ip->dynamic_power_wt_nuca;
  lp = g_ip->leakage_power_wt_nuca;
  a = g_ip->area_wt_nuca;
  d = g_ip->delay_wt_nuca;
  c = g_ip->cycle_time_wt_nuca;
  /*
   * The iterator is advanced explicitly: either past a kept candidate or
   * to the element following an erased one. Stepping back and forward
   * around erase() skipped the element after an erased head and walked
   * past end() when the last remaining element was erased.
   */
  list<nuca_org_t *>::iterator niter = n->begin();
  while (niter != n->end()) {
    nuca_org_t *cand = *niter;
    bool keep = true;
    fprintf(stderr, "\n-----------------------------"
        "---------------\n");
    printf("NUCA___stats %d \tbankcount: lat = %g \tdynP = %g \twt = %d\t "
        "bank_dpower = %g \tleak = %g \tcycle = %g\n",
        cand->bank_count,
        cand->nuca_pda.delay,
        cand->nuca_pda.power.readOp.dynamic,
        cand->h_wire->wt,
        cand->bank_pda.power.readOp.dynamic,
        cand->nuca_pda.power.readOp.leakage,
        cand->nuca_pda.cycle_time);
    if (g_ip->ed == 1) {
      cost = (cand->nuca_pda.delay/minval->min_delay)*
        (cand->nuca_pda.power.readOp.dynamic/minval->min_dyn);
      if (min_cost > cost) {
        min_cost = cost;
        res = cand;
      }
    }
    else if (g_ip->ed == 2) {
      cost = (cand->nuca_pda.delay/minval->min_delay)*
        (cand->nuca_pda.delay/minval->min_delay)*
        (cand->nuca_pda.power.readOp.dynamic/minval->min_dyn);
      if (min_cost > cost) {
        min_cost = cost;
        res = cand;
      }
    }
    else {
      /*
       * Guard against a zero minimum leakage before it is used as a
       * divisor, both in check_nuca_org() and in the cost below.
       */
      if (minval->min_leakage == 0) minval->min_leakage = 0.1; //FIXME remove this after leakage modeling
      /*
       * check whether the current organization
       * meets the input deviation constraints
       */
      if (check_nuca_org(cand, minval)) {
        cost = (d  * (cand->nuca_pda.delay/minval->min_delay) +
            c  * (cand->nuca_pda.cycle_time/minval->min_cyc) +
            dp * (cand->nuca_pda.power.readOp.dynamic/minval->min_dyn) +
            lp * (cand->nuca_pda.power.readOp.leakage/minval->min_leakage) +
            a  * (cand->nuca_pda.area.get_area()/minval->min_area));
        fprintf(stderr, "cost = %g\n", cost);
        if (min_cost > cost) {
          min_cost = cost;
          res = cand;
        }
      }
      else {
        keep = false;
      }
    }
    if (keep) {
      ++niter;
    }
    else {
      /* rejected candidates leave the list, so free them here */
      niter = n->erase(niter);
      delete cand;
    }
  }
  return res;
 }
  /*
   * Check a NUCA organization against the user-specified deviation limits.
   *
   * For delay, dynamic power, leakage power, cycle time and area (tested in
   * that order, stopping at the first violation) the percentage by which
   * the candidate exceeds the corresponding minimum in minval is compared
   * with the matching g_ip->*_dev_nuca limit.
   *
   * Returns 1 when every metric is within its limit, 0 otherwise.
   */
  int
 Nuca::check_nuca_org (nuca_org_t *n, min_values_t *minval)
 {
  Component &cand = n->nuca_pda;
  const bool violates =
    (((cand.delay - minval->min_delay)*100/minval->min_delay) >
       g_ip->delay_dev_nuca) ||
    (((cand.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
       g_ip->dynamic_power_dev_nuca) ||
    (((cand.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
       g_ip->leakage_power_dev_nuca) ||
    (((cand.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
       g_ip->cycle_time_dev_nuca) ||
    (((cand.area.get_area() - minval->min_area)/minval->min_area)*100 >
       g_ip->area_dev_nuca);
  return violates ? 0 : 1;
 }
  /*
   * Fill in nuca->nuca_pda.area from the grid shape.
   *
   * Height: rows * (bank height + width of a flit_size-wide bundle of
   *         horizontal wires, each taking wire_width + wire_spacing).
   * Width:  columns * (bank width + the equivalent bundle of vertical wires).
   */
  void
 Nuca::calculate_nuca_area (nuca_org_t *nuca)
 {
  Wire *hor = nuca->h_wire;
  Wire *ver = nuca->v_wire;
  Router *rtr = nuca->router;

  nuca->nuca_pda.area.h =
    nuca->rows * ((hor->wire_width + hor->wire_spacing) * rtr->flit_size +
        nuca->bank_pda.area.h);

  nuca->nuca_pda.area.w =
    nuca->columns * ((ver->wire_width + ver->wire_spacing) * rtr->flit_size +
        nuca->bank_pda.area.w);
 }
--- a/ext/mcpat/cacti/nuca.h
+++ b/ext/mcpat/cacti/nuca.h
@ -0,0 +1,100 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
 #ifndef __NUCA_H__
 #define __NUCA_H__
 #include <iostream>
 #include "assert.h"
 #include "basic_circuit.h"
 #include "cacti_interface.h"
 #include "component.h"
 #include "io.h"
 #include "mat.h"
 #include "parameter.h"
 #include "router.h"
 #include "wire.h"
 /*
  * One candidate NUCA organization: a (bank size, wire type, router type)
  * combination together with the best grid shape found for it.
  * Records are created and filled in by Nuca::sim_nuca().
  */
 class nuca_org_t {
  public:
  /* Empty destructor: the pointed-to wires and router are not freed here. */
  ~nuca_org_t();
 //    int size;
    /* area, power, access time, and cycle time stats */
    Component nuca_pda;  /* whole NUCA cache (network + banks) */
    Component bank_pda;  /* a single bank */
    Component wire_pda;  /* network wires */
    /* Links and router used by this organization; set by Nuca::sim_nuca(). */
    Wire *h_wire;
    Wire *v_wire;
    Router *router;
    /* for particular network configuration
     * calculated based on a cycle accurate
     * simulation Ref: CACTI 6 - Tech report
     */
    double contention;
    /* grid network stats */
    double avg_hops;
    int rows;
    int columns;
    int bank_count;
 };
 /*
  * NUCA (non-uniform cache access) design-space explorer. sim_nuca() is the
  * entry point; the remaining public methods are the steps it is built from.
  */
 class Nuca : public Component
 {
  public:
    /* Stores the device type and loads the contention table (init_cont). */
    Nuca(
        TechnologyParameter::DeviceType *dt);
    /* NOTE(review): no definition is visible in nuca.cc - confirm it exists. */
    void print_router();
    /* Deletes the Wire objects held in the wt_min..wt_max table slots. */
    ~Nuca();
    /* Runs the exhaustive search and prints the chosen organization. */
    void sim_nuca();
    /* Reads contention.dat into the global cont_stats table. */
    void init_cont();
    /* Converts a latency in s to cycles at oper_freq (GHz), rounding up. */
    int calc_cycles(double lat, double oper_freq);
    /* Computes nuca->nuca_pda.area from the grid shape, wires and banks. */
    void calculate_nuca_area (nuca_org_t *nuca);
    /* Returns 1 if n is within the g_ip deviation limits of minval, else 0. */
    int check_nuca_org (nuca_org_t *n, min_values_t *minval);
    /* Returns the lowest-cost entry of n, or NULL if none qualifies. */
    nuca_org_t * find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval);
    /* Prints a report for organization n to stdout. */
    void print_nuca(nuca_org_t *n);
    /* Prints the contention table to stdout. */
    void print_cont_stats();
  private:
    TechnologyParameter::DeviceType *deviceType;
    /* Inclusive range of wire types explored; assigned in sim_nuca(). */
    int wt_min, wt_max;
    /* Wire objects per wire type, (re)allocated by sim_nuca(). */
    Wire *wire_vertical[WIRE_TYPES],
         *wire_horizontal[WIRE_TYPES];
 };
 #endif
--- a/ext/mcpat/cacti/parameter.cc
+++ b/ext/mcpat/cacti/parameter.cc
@ -0,0 +1,713 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include <iomanip>
 #include <iostream>
 #include <string>
 #include "area.h"
 #include "parameter.h"
 using namespace std;
 // Global pointer to the active input parameters. It has static storage, so
 // it is null until some other code assigns it; DynamicParameter's
 // constructor in this file dereferences it without a null check.
 InputParameter * g_ip;
 // Global technology parameter set; read by DynamicParameter's constructor
 // in this file (cell sizes, wire pitch, DRAM cell values).
 TechnologyParameter g_tp;
 void TechnologyParameter::DeviceType::display(uint32_t indent)
 {
  string indent_str(indent, ' ');
  cout << indent_str << "C_g_ideal = " << setw(12) << C_g_ideal << " F/um" << endl;
  cout << indent_str << "C_fringe  = " << setw(12) << C_fringe  << " F/um" << endl;
  cout << indent_str << "C_overlap = " << setw(12) << C_overlap << " F/um" << endl;
  cout << indent_str << "C_junc    = " << setw(12) << C_junc    << " F/um^2" << endl;
  cout << indent_str << "l_phy     = " << setw(12) << l_phy     << " um" << endl;
  cout << indent_str << "l_elec    = " << setw(12) << l_elec    << " um" << endl;
  cout << indent_str << "R_nch_on  = " << setw(12) << R_nch_on  << " ohm-um" << endl;
  cout << indent_str << "R_pch_on  = " << setw(12) << R_pch_on  << " ohm-um" << endl;
  cout << indent_str << "Vdd       = " << setw(12) << Vdd       << " V" << endl;
  cout << indent_str << "Vth       = " << setw(12) << Vth       << " V" << endl;
  cout << indent_str << "I_on_n    = " << setw(12) << I_on_n    << " A/um" << endl;
  cout << indent_str << "I_on_p    = " << setw(12) << I_on_p    << " A/um" << endl;
  cout << indent_str << "I_off_n   = " << setw(12) << I_off_n   << " A/um" << endl;
  cout << indent_str << "I_off_p   = " << setw(12) << I_off_p   << " A/um" << endl;
  cout << indent_str << "C_ox      = " << setw(12) << C_ox      << " F/um^2" << endl;
  cout << indent_str << "t_ox      = " << setw(12) << t_ox      << " um" << endl;
  cout << indent_str << "n_to_p_eff_curr_drv_ratio = " << n_to_p_eff_curr_drv_ratio << endl;
 }
 void TechnologyParameter::InterconnectType::display(uint32_t indent)
 {
  string indent_str(indent, ' ');
  cout << indent_str << "pitch    = " << setw(12) << pitch    << " um" << endl;
  cout << indent_str << "R_per_um = " << setw(12) << R_per_um << " ohm/um" << endl;
  cout << indent_str << "C_per_um = " << setw(12) << C_per_um << " F/um" << endl;
 }
 void TechnologyParameter::ScalingFactor::display(uint32_t indent)
 {
  string indent_str(indent, ' ');
  cout << indent_str << "logic_scaling_co_eff    = " << setw(12) << logic_scaling_co_eff << endl;
  cout << indent_str << "curr_core_tx_density = " << setw(12) << core_tx_density << " # of tx/um^2" << endl;
 }
 void TechnologyParameter::MemoryType::display(uint32_t indent)
 {
  string indent_str(indent, ' ');
  cout << indent_str << "b_w         = " << setw(12) << b_w << " um" << endl;
  cout << indent_str << "b_h         = " << setw(12) << b_h << " um" << endl;
  cout << indent_str << "cell_a_w    = " << setw(12) << cell_a_w << " um" << endl;
  cout << indent_str << "cell_pmos_w = " << setw(12) << cell_pmos_w << " um" << endl;
  cout << indent_str << "cell_nmos_w = " << setw(12) << cell_nmos_w << " um" << endl;
  cout << indent_str << "Vbitpre     = " << setw(12) << Vbitpre << " V" << endl;
 }
 void TechnologyParameter::display(uint32_t indent)
 {
  string indent_str(indent, ' ');
  cout << indent_str << "ram_wl_stitching_overhead_ = " << setw(12) << ram_wl_stitching_overhead_ << " um" << endl;
  cout << indent_str << "min_w_nmos_                = " << setw(12) << min_w_nmos_                << " um" << endl;
  cout << indent_str << "max_w_nmos_                = " << setw(12) << max_w_nmos_                << " um" << endl;
  cout << indent_str << "unit_len_wire_del          = " << setw(12) << unit_len_wire_del          << " s/um^2" << endl;
  cout << indent_str << "FO4                        = " << setw(12) << FO4                        << " s" << endl;
  cout << indent_str << "kinv                       = " << setw(12) << kinv                       << " s" << endl;
  cout << indent_str << "vpp                        = " << setw(12) << vpp                        << " V" << endl;
  cout << indent_str << "w_sense_en                 = " << setw(12) << w_sense_en                 << " um" << endl;
  cout << indent_str << "w_sense_n                  = " << setw(12) << w_sense_n                  << " um" << endl;
  cout << indent_str << "w_sense_p                  = " << setw(12) << w_sense_p                  << " um" << endl;
  cout << indent_str << "w_iso                      = " << setw(12) << w_iso                      << " um" << endl;
  cout << indent_str << "w_poly_contact             = " << setw(12) << w_poly_contact             << " um" << endl;
  cout << indent_str << "spacing_poly_to_poly       = " << setw(12) << spacing_poly_to_poly       << " um" << endl;
  cout << indent_str << "spacing_poly_to_contact    = " << setw(12) << spacing_poly_to_contact    << " um" << endl;
  cout << endl;
  cout << indent_str << "w_comp_inv_p1              = " << setw(12) << w_comp_inv_p1 << " um" << endl;
  cout << indent_str << "w_comp_inv_p2              = " << setw(12) << w_comp_inv_p2 << " um" << endl;
  cout << indent_str << "w_comp_inv_p3              = " << setw(12) << w_comp_inv_p3 << " um" << endl;
  cout << indent_str << "w_comp_inv_n1              = " << setw(12) << w_comp_inv_n1 << " um" << endl;
  cout << indent_str << "w_comp_inv_n2              = " << setw(12) << w_comp_inv_n2 << " um" << endl;
  cout << indent_str << "w_comp_inv_n3              = " << setw(12) << w_comp_inv_n3 << " um" << endl;
  cout << indent_str << "w_eval_inv_p               = " << setw(12) << w_eval_inv_p  << " um" << endl;
  cout << indent_str << "w_eval_inv_n               = " << setw(12) << w_eval_inv_n  << " um" << endl;
  cout << indent_str << "w_comp_n                   = " << setw(12) << w_comp_n      << " um" << endl;
  cout << indent_str << "w_comp_p                   = " << setw(12) << w_comp_p      << " um" << endl;
  cout << endl;
  cout << indent_str << "dram_cell_I_on             = " << setw(12) << dram_cell_I_on << " A/um" << endl;
  cout << indent_str << "dram_cell_Vdd              = " << setw(12) << dram_cell_Vdd  << " V" << endl;
  cout << indent_str << "dram_cell_I_off_worst_case_len_temp = " << setw(12) << dram_cell_I_off_worst_case_len_temp << " A/um" << endl;
  cout << indent_str << "dram_cell_C                = " << setw(12) << dram_cell_C               << " F" << endl;
  cout << indent_str << "gm_sense_amp_latch         = " << setw(12) << gm_sense_amp_latch        << " F/s" << endl;
  cout << endl;
  cout << indent_str << "w_nmos_b_mux               = " << setw(12) << w_nmos_b_mux              << " um" << endl;
  cout << indent_str << "w_nmos_sa_mux              = " << setw(12) << w_nmos_sa_mux             << " um" << endl;
  cout << indent_str << "w_pmos_bl_precharge        = " << setw(12) << w_pmos_bl_precharge       << " um" << endl;
  cout << indent_str << "w_pmos_bl_eq               = " << setw(12) << w_pmos_bl_eq              << " um" << endl;
  cout << indent_str << "MIN_GAP_BET_P_AND_N_DIFFS  = " << setw(12) << MIN_GAP_BET_P_AND_N_DIFFS << " um" << endl;
  cout << indent_str << "HPOWERRAIL                 = " << setw(12) << HPOWERRAIL                << " um" << endl;
  cout << indent_str << "cell_h_def                 = " << setw(12) << cell_h_def                << " um" << endl;
  cout << endl;
  cout << indent_str << "SRAM cell transistor: " << endl;
  sram_cell.display(indent + 2);
  cout << endl;
  cout << indent_str << "DRAM access transistor: " << endl;
  dram_acc.display(indent + 2);
  cout << endl;
  cout << indent_str << "DRAM wordline transistor: " << endl;
  dram_wl.display(indent + 2);
  cout << endl;
  cout << indent_str << "peripheral global transistor: " << endl;
  peri_global.display(indent + 2);
  cout << endl;
  cout << indent_str << "wire local" << endl;
  wire_local.display(indent + 2);
  cout << endl;
  cout << indent_str << "wire inside mat" << endl;
  wire_inside_mat.display(indent + 2);
  cout << endl;
  cout << indent_str << "wire outside mat" << endl;
  wire_outside_mat.display(indent + 2);
  cout << endl;
  cout << indent_str << "SRAM" << endl;
  sram.display(indent + 2);
  cout << endl;
  cout << indent_str << "DRAM" << endl;
  dram.display(indent + 2);
 }
 // Default constructor: clears use_inp_params, value-initializes cell and
 // marks the object valid. No other member is set here.
 DynamicParameter::DynamicParameter()
  : use_inp_params(0),
    cell(),
    is_valid(true)
 {
 }
 /*
  * Derives the array organization for one candidate partitioning.
  *
  * Inputs besides the arguments are the globals g_ip (input parameters) and
  * g_tp (technology parameters). From them the constructor computes, in
  * order: rows/columns of a subarray, cell dimensions, bitline capacitance
  * and sense voltage, the mat grid, data-in/out (and search-in/out) bit
  * counts per mat / subbank / bank port, address bits and finally the ECC
  * adjustment.
  *
  * is_valid starts out false and is set to true only on the last line; every
  * early `return` therefore marks the candidate as rejected. Callers must
  * check is_valid before using any derived field.
  *
  * Parameters:
  *   is_tag_       true selects the tag-array settings of g_ip, false the
  *                 data-array settings.
  *   pure_ram_     stored in pure_ram; not otherwise read here.
  *   pure_cam_     non-zero selects the CAM code paths.
  *   Nspd_         scaling factor applied to columns (and inversely to rows)
  *                 of a subarray; must be exactly 1 for FA/CAM arrays.
  *   Ndwl_, Ndbl_  partitioning factors: subarray columns are divided by
  *                 Ndwl and rows by Ndbl; num_subarrays = Ndwl * Ndbl.
  *   Ndcm_         bitline mux degree for SRAM (stored in deg_bl_muxing).
  *   Ndsam_lev_1_, Ndsam_lev_2_  sense-amp mux degrees, level 1 and 2.
  *   is_main_mem_  true selects the main-memory rules for output width and
  *                 address bits.
  */
 DynamicParameter::DynamicParameter(
    bool is_tag_,
    int pure_ram_,
    int pure_cam_,
    double Nspd_,
    unsigned int Ndwl_,
    unsigned int Ndbl_,
    unsigned int Ndcm_,
    unsigned int Ndsam_lev_1_,
    unsigned int Ndsam_lev_2_,
    bool is_main_mem_):
  is_tag(is_tag_), pure_ram(pure_ram_), pure_cam(pure_cam_), tagbits(0), Nspd(Nspd_), Ndwl(Ndwl_), Ndbl(Ndbl_),Ndcm(Ndcm_),
  Ndsam_lev_1(Ndsam_lev_1_), Ndsam_lev_2(Ndsam_lev_2_),
  number_way_select_signals_mat(0), V_b_sense(0), use_inp_params(0),
  is_main_mem(is_main_mem_), cell(), is_valid(false)
 {
  // The search-port bit counts are only computed on the fully-associative /
  // CAM paths, yet they are read unconditionally further down (the
  // num_si_b_subbank assignment and the ECC adjustment). Zero them up front
  // so that ordinary RAM/cache arrays never read indeterminate values.
  num_so_b_mat = 0;
  num_si_b_mat = 0;
  num_so_b_subbank = 0;
  num_si_b_subbank = 0;
  num_so_b_bank_per_port = 0;
  num_si_b_bank_per_port = 0;
  ram_cell_tech_type = (is_tag) ? g_ip->tag_arr_ram_cell_tech_type : g_ip->data_arr_ram_cell_tech_type;
  is_dram            = ((ram_cell_tech_type == lp_dram) || (ram_cell_tech_type == comm_dram));
  unsigned int capacity_per_die = g_ip->cache_sz / NUMBER_STACKED_DIE_LAYERS;  // capacity per stacked die layer
  const TechnologyParameter::InterconnectType & wire_local = g_tp.wire_local;
  fully_assoc = (g_ip->fully_assoc) ? true : false;
  if (fully_assoc || pure_cam)
  { // fully-assocative cache -- ref: CACTi 2.0 report
          if (Ndwl != 1 ||            //Ndwl is fixed to 1 for FA
                          Ndcm != 1 ||            //Ndcm is fixed to 1 for FA
                          Nspd < 1 || Nspd > 1 || //Nspd is fixed to 1 for FA
                          Ndsam_lev_1 != 1 ||     //Ndsam_lev_1 is fixed to one
                          Ndsam_lev_2 != 1 ||     //Ndsam_lev_2 is fixed to one
                          Ndbl < 2)
          {
          return;
          }
  }
  if ((is_dram) && (!is_tag) && (Ndcm > 1))
  {
          return;  // For a DRAM array, each bitline has its own sense-amp
  }
  // For a non-FA tag/data array both Ndwl and Ndbl must be non-zero. The
  // check below only rejects zero; an earlier version of this comment said
  // "at least two", which does not match the code (a value of 1 is accepted
  // and later handled by MAX(Ndwl / 2, 1) / MAX(Ndbl / 2, 1)).
  if (fully_assoc == false && (Ndwl < 1 || Ndbl < 1))
  {
          return;
  }
  //***********compute row, col of an subarray
  if (!(fully_assoc || pure_cam))//Not fully_asso nor cam
  {
          // if data array, let tagbits = 0
          if (is_tag)
          {
                  if (g_ip->specific_tag)
                  {
                          tagbits = g_ip->tag_w;
                  }
                  else
                  {
                          tagbits = ADDRESS_BITS + EXTRA_TAG_BITS - _log2(capacity_per_die) +
                          _log2(g_ip->tag_assoc*2 - 1) - _log2(g_ip->nbanks);
                  }
                  // round tagbits up to the next multiple of four
                  tagbits = (((tagbits + 3) >> 2) << 2);
                  num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks *
                                  g_ip->block_sz * g_ip->tag_assoc * Ndbl * Nspd));// + EPSILON);
                  num_c_subarray = (int)ceil((tagbits * g_ip->tag_assoc * Nspd / Ndwl));// + EPSILON);
                  //burst_length = 1;
          }
          else
          {
                  num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks *
                                  g_ip->block_sz * g_ip->data_assoc * Ndbl * Nspd));// + EPSILON);
                  num_c_subarray = (int)ceil((8 * g_ip->block_sz * g_ip->data_assoc * Nspd / Ndwl));// + EPSILON); + EPSILON);
                  // burst_length = g_ip->block_sz * 8 / g_ip->out_w;
          }
          // reject subarray shapes outside the allowed row/column window
          if (num_r_subarray < MINSUBARRAYROWS) return;
          if (num_r_subarray == 0) return;
          if (num_r_subarray > MAXSUBARRAYROWS) return;
          if (num_c_subarray < MINSUBARRAYCOLS) return;
          if (num_c_subarray > MAXSUBARRAYCOLS) return;
  }
  else
  {//either fully-asso or cam
          if (pure_cam)
          {
                  if (g_ip->specific_tag)
                  {
                          // round the tag width up to a whole number of bytes
                          tagbits = int(ceil(g_ip->tag_w/8.0)*8);
                  }
                  else
                  {
                          tagbits = int(ceil((ADDRESS_BITS + EXTRA_TAG_BITS)/8.0)*8);
 //			  cout<<"Pure CAM needs tag width to be specified"<<endl;
 //			  exit(0);
                  }
                  //tagbits = (((tagbits + 3) >> 2) << 2);
                  tag_num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks*tagbits/8.0 * Ndbl));//TODO: error check input of tagbits and blocksize //TODO: for pure CAM, g_ip->block should be number of entries.
                  //tag_num_c_subarray = (int)(tagbits  + EPSILON);
                  tag_num_c_subarray = tagbits;
                  if (tag_num_r_subarray == 0) return;
                  if (tag_num_r_subarray > MAXSUBARRAYROWS) return;
                  if (tag_num_c_subarray < MINSUBARRAYCOLS) return;
                  if (tag_num_c_subarray > MAXSUBARRAYCOLS) return;
                  num_r_subarray = tag_num_r_subarray;
          }
          else //fully associative
          {
                  if (g_ip->specific_tag)
                  {
                          tagbits = g_ip->tag_w;
                  }
                  else
                  {
                          tagbits = ADDRESS_BITS + EXTRA_TAG_BITS - _log2(g_ip->block_sz);//TODO: should be the page_offset=log2(page size), but this info is not avail with CACTI, for McPAT this is no problem.
                  }
                  // round tagbits up to the next multiple of four
                  tagbits = (((tagbits + 3) >> 2) << 2);
                  tag_num_r_subarray = (int)(capacity_per_die / (g_ip->nbanks*g_ip->block_sz * Ndbl));
                  tag_num_c_subarray = (int)ceil((tagbits * Nspd / Ndwl));// + EPSILON);
                  if (tag_num_r_subarray == 0) return;
                  if (tag_num_r_subarray > MAXSUBARRAYROWS) return;
                  if (tag_num_c_subarray < MINSUBARRAYCOLS) return;
                  if (tag_num_c_subarray > MAXSUBARRAYCOLS) return;
                  data_num_r_subarray = tag_num_r_subarray;
                  data_num_c_subarray = 8 * g_ip->block_sz;
                  if (data_num_r_subarray == 0) return;
                  if (data_num_r_subarray > MAXSUBARRAYROWS) return;
                  if (data_num_c_subarray < MINSUBARRAYCOLS) return;
                  if (data_num_c_subarray > MAXSUBARRAYCOLS) return;
                  num_r_subarray = tag_num_r_subarray;
          }
  }
  num_subarrays = Ndwl * Ndbl;
  //****************end of computation of row, col of an subarray
  // calculate wire parameters
  // Cell height/width = base cell size plus extra wire tracks: two tracks
  // per additional port, one track per single-ended read port.
  if (fully_assoc || pure_cam)
  {
          cam_cell.h = g_tp.cam.b_h + 2 * wire_local.pitch * (g_ip->num_rw_ports-1 + g_ip->num_rd_ports + g_ip->num_wr_ports)
          + 2 * wire_local.pitch*(g_ip->num_search_ports-1) + wire_local.pitch * g_ip->num_se_rd_ports;
          cam_cell.w = g_tp.cam.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports-1 + g_ip->num_rd_ports + g_ip->num_wr_ports)
          + 2 * wire_local.pitch*(g_ip->num_search_ports-1) + wire_local.pitch * g_ip->num_se_rd_ports;
          cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_wr_ports +g_ip->num_rw_ports-1 + g_ip->num_rd_ports)
          + 2 * wire_local.pitch*(g_ip->num_search_ports-1);
          cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports -1 + (g_ip->num_rd_ports - g_ip->num_se_rd_ports)
                          + g_ip->num_wr_ports) + g_tp.wire_local.pitch * g_ip->num_se_rd_ports + 2 * wire_local.pitch*(g_ip->num_search_ports-1);
  }
  else
  {
          if(is_tag)
          {
                  cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 + g_ip->num_rd_ports +
                                  g_ip->num_wr_ports);
                  cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 + g_ip->num_wr_ports +
                                  (g_ip->num_rd_ports - g_ip->num_se_rd_ports)) +
                                  wire_local.pitch * g_ip->num_se_rd_ports;
          }
          else
          {
                  if (is_dram)
                  {
                          cell.h = g_tp.dram.b_h;
                          cell.w = g_tp.dram.b_w;
                  }
                  else
                  {
                          cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_wr_ports +
                                          g_ip->num_rw_ports - 1 + g_ip->num_rd_ports);
                          cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 +
                                          (g_ip->num_rd_ports - g_ip->num_se_rd_ports) +
                                          g_ip->num_wr_ports) + g_tp.wire_local.pitch * g_ip->num_se_rd_ports;
                  }
          }
  }
  double c_b_metal = cell.h * wire_local.C_per_um;
  double C_bl;
  if (!(fully_assoc || pure_cam))
  {
          if (is_dram)
          {
                  deg_bl_muxing = 1;
                  if (ram_cell_tech_type == comm_dram)
                  {
                          C_bl  = num_r_subarray * c_b_metal;
                          V_b_sense = (g_tp.dram_cell_Vdd/2) * g_tp.dram_cell_C / (g_tp.dram_cell_C + C_bl);
                          if (V_b_sense < VBITSENSEMIN)
                          {
                                  return;
                          }
                          V_b_sense = VBITSENSEMIN;  // in any case, we fix sense amp input signal to a constant value
                          dram_refresh_period = 64e-3;
                  }
                  else
                  {
                          double Cbitrow_drain_cap = drain_C_(g_tp.dram.cell_a_w, NCH, 1, 0, cell.w, true, true) / 2.0;
                          C_bl  = num_r_subarray * (Cbitrow_drain_cap + c_b_metal);
                          V_b_sense = (g_tp.dram_cell_Vdd/2) * g_tp.dram_cell_C /(g_tp.dram_cell_C + C_bl);
                          if (V_b_sense < VBITSENSEMIN)
                          {
                                  return; //Sense amp input signal is smaller that minimum allowable sense amp input signal
                          }
                          V_b_sense = VBITSENSEMIN; // in any case, we fix sense amp input signal to a constant value
                          //v_storage_worst = g_tp.dram_cell_Vdd / 2 - VBITSENSEMIN * (g_tp.dram_cell_C + C_bl) / g_tp.dram_cell_C;
                          //dram_refresh_period = 1.1 * g_tp.dram_cell_C * v_storage_worst / g_tp.dram_cell_I_off_worst_case_len_temp;
                          dram_refresh_period = 0.9 * g_tp.dram_cell_C * VDD_STORAGE_LOSS_FRACTION_WORST * g_tp.dram_cell_Vdd / g_tp.dram_cell_I_off_worst_case_len_temp;
                  }
          }
          else
          { //SRAM
                  V_b_sense = (0.05 * g_tp.sram_cell.Vdd > VBITSENSEMIN) ? 0.05 * g_tp.sram_cell.Vdd : VBITSENSEMIN;
                  deg_bl_muxing = Ndcm;
                  // "/ 2.0" below is due to the fact that two adjacent access transistors share drain
                  // contacts in a physical layout
                  double Cbitrow_drain_cap = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0;
                  C_bl = num_r_subarray * (Cbitrow_drain_cap + c_b_metal);
                  dram_refresh_period = 0;
          }
  }
  else
  {
          c_b_metal = cam_cell.h * wire_local.C_per_um;//IBM and SUN design, SRAM array uses dummy cells to fill the blank space due to mismatch on CAM-RAM
          V_b_sense = (0.05 * g_tp.sram_cell.Vdd > VBITSENSEMIN) ? 0.05 * g_tp.sram_cell.Vdd : VBITSENSEMIN;
          deg_bl_muxing = 1;//FA fix as 1
          // "/ 2.0" below is due to the fact that two adjacent access transistors share drain
          // contacts in a physical layout
          double Cbitrow_drain_cap = drain_C_(g_tp.cam.cell_a_w, NCH, 1, 0, cam_cell.w, false, true) / 2.0;//TODO: comment out these two lines
          C_bl = num_r_subarray * (Cbitrow_drain_cap + c_b_metal);
          dram_refresh_period = 0;
  }
  // do/di: data in/out, for fully associative they are the data width for normal read and write
  // so/si: search data in/out, for fully associative they are the data width for the search ops
  // for CAM, si=di, but so = matching address. do = data out = di (for normal read/write)
  // so/si needs broadcase while do/di do not
  if (fully_assoc || pure_cam)
  {
            // Ndbl < 2 was already rejected above for FA/CAM arrays, so the
            // 0 and 1 cases are defensive only.
            switch (Ndbl) {
              case (0):
                cout <<  "   Invalid Ndbl \n"<<endl;
                exit(1);  // invalid input is an error: report failure, not success
                break;
              case (1):
                  num_mats_h_dir = 1;//one subarray per mat
                  num_mats_v_dir = 1;
                break;
              case (2):
                  num_mats_h_dir = 1;//two subarrays per mat
                  num_mats_v_dir = 1;
                  break;
              default:
                  // NOTE(review): Ndbl == 3 would make num_mats_h_dir zero and
                  // the next line divide by it -- presumably callers only pass
                  // powers of two; confirm.
                  num_mats_h_dir = int(floor(sqrt(Ndbl/4.0)));//4 subbarrys per mat
                  num_mats_v_dir = int(Ndbl/4.0 / num_mats_h_dir);
            }
            num_mats = num_mats_h_dir * num_mats_v_dir;
            if (fully_assoc)
            {
                num_so_b_mat   = data_num_c_subarray;
                num_do_b_mat   = data_num_c_subarray + tagbits;
            }
            else
            {
                num_so_b_mat = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays)));//the address contains the matched data
                num_do_b_mat = tagbits;
            }
  }
  else
  {
          num_mats_h_dir = MAX(Ndwl / 2, 1);
          num_mats_v_dir = MAX(Ndbl / 2, 1);
          num_mats       = num_mats_h_dir * num_mats_v_dir;
          num_do_b_mat   = MAX((num_subarrays/num_mats) * num_c_subarray / (deg_bl_muxing * Ndsam_lev_1 * Ndsam_lev_2), 1);
  }
  if (!(fully_assoc|| pure_cam) && (num_do_b_mat < (num_subarrays/num_mats)))
  {
          return;
  }
  int deg_sa_mux_l1_non_assoc;
  //TODO:the i/o for subbank is not necessary and should be removed.
  if (!(fully_assoc || pure_cam))
  {
          if (!is_tag)
          {
                  if (is_main_mem == true)
                  {
                          num_do_b_subbank = g_ip->int_prefetch_w * g_ip->out_w;
                          deg_sa_mux_l1_non_assoc = Ndsam_lev_1;
                  }
                  else
                  {
                          if (g_ip->fast_access == true)
                          {
                                  num_do_b_subbank = g_ip->out_w * g_ip->data_assoc;
                                  deg_sa_mux_l1_non_assoc = Ndsam_lev_1;
                          }
                          else
                          {
                                  num_do_b_subbank = g_ip->out_w;
                                  deg_sa_mux_l1_non_assoc = Ndsam_lev_1 / g_ip->data_assoc;
                                  if (deg_sa_mux_l1_non_assoc < 1)
                                  {
                                          return;
                                  }
                          }
                  }
          }
          else
          {
                  num_do_b_subbank = tagbits * g_ip->tag_assoc;
                  if (num_do_b_mat < tagbits)
                  {
                          return;
                  }
                  deg_sa_mux_l1_non_assoc = Ndsam_lev_1;
                  //num_do_b_mat = g_ip->tag_assoc / num_mats_h_dir;
          }
  }
  else
  {
          if (fully_assoc)
          {
                  num_so_b_subbank = 8 * g_ip->block_sz;//TODO:internal perfetch should be considered also for fa
                  num_do_b_subbank = num_so_b_subbank + tag_num_c_subarray;
          }
          else
          {
                  num_so_b_subbank = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays)));//the address contains the matched data
                  num_do_b_subbank = tag_num_c_subarray;
          }
          deg_sa_mux_l1_non_assoc = 1;
  }
  deg_senseamp_muxing_non_associativity = deg_sa_mux_l1_non_assoc;
  if (fully_assoc || pure_cam)
  {
          num_act_mats_hor_dir = 1;
          num_act_mats_hor_dir_sl = num_mats_h_dir;//TODO: this is unnecessary, since search op, num_mats is used
  }
  else
  {
          num_act_mats_hor_dir = num_do_b_subbank / num_do_b_mat;
          if (num_act_mats_hor_dir == 0)
          {
                  return;
          }
  }
  //compute num_do_mat for tag
  if (is_tag)
  {
          if (!(fully_assoc || pure_cam))
          {
                  num_do_b_mat     = g_ip->tag_assoc / num_act_mats_hor_dir;
                  num_do_b_subbank = num_act_mats_hor_dir * num_do_b_mat;
          }
  }
  // Main memory (or page-mode DRAM): the bits sensed per access must equal
  // the configured page size exactly.
  if ((g_ip->is_cache == false && is_main_mem == true) || (PAGE_MODE == 1 && is_dram))
  {
          if (num_act_mats_hor_dir * num_do_b_mat * Ndsam_lev_1 * Ndsam_lev_2 != (int)g_ip->page_sz_bits)
          {
                  return;
          }
  }
 //  if (is_tag == false && g_ip->is_cache == true && !fully_assoc && !pure_cam && //TODO: TODO burst transfer should also apply to RAM arrays
  if (is_tag == false && g_ip->is_main_mem == true &&
                  num_act_mats_hor_dir*num_do_b_mat*Ndsam_lev_1*Ndsam_lev_2 < ((int) g_ip->out_w * (int) g_ip->burst_len * (int) g_ip->data_assoc))
  {
          return;
  }
  if (num_act_mats_hor_dir > num_mats_h_dir)
  {
          return;
  }
  //compute di for mat subbank and bank
  if (!(fully_assoc ||pure_cam))
  {
          if(!is_tag)
          {
                  if(g_ip->fast_access == true)
                  {
                          num_di_b_mat = num_do_b_mat / g_ip->data_assoc;
                  }
                  else
                  {
                          num_di_b_mat = num_do_b_mat;
                  }
          }
          else
          {
                  num_di_b_mat = tagbits;
          }
  }
  else
  {
          if (fully_assoc)
          {
                  num_di_b_mat = num_do_b_mat;
                  //*num_subarrays/num_mats; bits per mat of CAM/FA is as same as cache,
                  //but inside the mat wire tracks need to be reserved for search data bus
                  num_si_b_mat = tagbits;
          }
          else
          {
                  num_di_b_mat = tagbits;
                  num_si_b_mat = tagbits;//*num_subarrays/num_mats;
          }
  }
  num_di_b_subbank       = num_di_b_mat * num_act_mats_hor_dir;//normal cache or normal r/w for FA
  // For non-FA/CAM arrays num_si_b_mat is still the zero set on entry.
  num_si_b_subbank       = num_si_b_mat; //* num_act_mats_hor_dir_sl; inside the data is broadcast
  int num_addr_b_row_dec     = _log2(num_r_subarray);
  if  ((fully_assoc ||pure_cam))
          num_addr_b_row_dec     +=_log2(num_subarrays/num_mats);
  int number_subbanks        = num_mats / num_act_mats_hor_dir;
  number_subbanks_decode = _log2(number_subbanks);//TODO: add log2(num_subarray_per_bank) to FA/CAM
  num_rw_ports = g_ip->num_rw_ports;
  num_rd_ports = g_ip->num_rd_ports;
  num_wr_ports = g_ip->num_wr_ports;
  num_se_rd_ports = g_ip->num_se_rd_ports;
  num_search_ports = g_ip->num_search_ports;
  if (is_dram && is_main_mem)
  {
          // larger of the row-address bits and the mux-select bits
          number_addr_bits_mat = MAX((unsigned int) num_addr_b_row_dec,
                          _log2(deg_bl_muxing) + _log2(deg_sa_mux_l1_non_assoc) + _log2(Ndsam_lev_2));
  }
  else
  {
          number_addr_bits_mat = num_addr_b_row_dec + _log2(deg_bl_muxing) +
          _log2(deg_sa_mux_l1_non_assoc) + _log2(Ndsam_lev_2);
  }
  if (!(fully_assoc ||pure_cam))
  {
          if (is_tag)
          {
                  num_di_b_bank_per_port = tagbits;
                  num_do_b_bank_per_port = g_ip->data_assoc;
          }
          else
          {
                  num_di_b_bank_per_port = g_ip->out_w + g_ip->data_assoc;
                  num_do_b_bank_per_port = g_ip->out_w;
          }
  }
  else
  {
          if (fully_assoc)
          {
                  num_di_b_bank_per_port = g_ip->out_w + tagbits;//TODO: out_w or block_sz?
                  num_si_b_bank_per_port = tagbits;
                  num_do_b_bank_per_port = g_ip->out_w + tagbits;
                  num_so_b_bank_per_port = g_ip->out_w;
          }
          else
          {
                  num_di_b_bank_per_port = tagbits;
                  num_si_b_bank_per_port = tagbits;
                  num_do_b_bank_per_port = tagbits;
                  num_so_b_bank_per_port = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays)));
          }
  }
  if ((!is_tag) && (g_ip->data_assoc > 1) && (!g_ip->fast_access))
  {
          number_way_select_signals_mat = g_ip->data_assoc;
  }
  // add ECC adjustment to all data signals that traverse on H-trees.
  if (g_ip->add_ecc_b_ == true)
  {
          num_do_b_mat += (int) (ceil(num_do_b_mat / num_bits_per_ecc_b_));
          num_di_b_mat += (int) (ceil(num_di_b_mat / num_bits_per_ecc_b_));
          num_di_b_subbank += (int) (ceil(num_di_b_subbank / num_bits_per_ecc_b_));
          num_do_b_subbank += (int) (ceil(num_do_b_subbank / num_bits_per_ecc_b_));
          num_di_b_bank_per_port += (int) (ceil(num_di_b_bank_per_port / num_bits_per_ecc_b_));
          num_do_b_bank_per_port += (int) (ceil(num_do_b_bank_per_port / num_bits_per_ecc_b_));
          // The search counters below are zero for non-FA/CAM arrays, so the
          // adjustment leaves them at zero there.
          num_so_b_mat += (int) (ceil(num_so_b_mat / num_bits_per_ecc_b_));
          num_si_b_mat += (int) (ceil(num_si_b_mat / num_bits_per_ecc_b_));
          num_si_b_subbank += (int) (ceil(num_si_b_subbank / num_bits_per_ecc_b_));
          num_so_b_subbank += (int) (ceil(num_so_b_subbank / num_bits_per_ecc_b_));
          num_si_b_bank_per_port += (int) (ceil(num_si_b_bank_per_port / num_bits_per_ecc_b_));
          num_so_b_bank_per_port += (int) (ceil(num_so_b_bank_per_port / num_bits_per_ecc_b_));
  }
  // every check passed: the derived organization is usable
  is_valid = true;
 }
--- a/ext/mcpat/cacti/parameter.h
+++ b/ext/mcpat/cacti/parameter.h
@ -0,0 +1,367 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __PARAMETER_H__
 #define __PARAMETER_H__
 #include "area.h"
 #include "cacti_interface.h"
 #include "const.h"
 #include "io.h"
 // Parameters which are functions of a certain device technology.
 // Bundles per-device (DeviceType), per-wire (InterconnectType),
 // per-memory-cell (MemoryType) and scaling (ScalingFactor) values together
 // with a set of scalar technology values.
 class TechnologyParameter
 {
 public:
  // Electrical parameters of one transistor flavour.  Every member is
  // zeroed both by the constructor and by reset().
  class DeviceType
  {
   public:
    double C_g_ideal;
    double C_fringe;
    double C_overlap;
    double C_junc;  // C_junc_area
    double C_junc_sidewall;
    double l_phy;
    double l_elec;
    double R_nch_on;
    double R_pch_on;
    double Vdd;
    double Vth;
    double I_on_n;
    double I_on_p;
    double I_off_n;
    double I_off_p;
    double I_g_on_n;
    double I_g_on_p;
    double C_ox;
    double t_ox;
    double n_to_p_eff_curr_drv_ratio;
    double long_channel_leakage_reduction;
    DeviceType(): C_g_ideal(0), C_fringe(0), C_overlap(0), C_junc(0),
                  C_junc_sidewall(0), l_phy(0), l_elec(0), R_nch_on(0), R_pch_on(0),
                  Vdd(0), Vth(0),
                  I_on_n(0), I_on_p(0), I_off_n(0), I_off_p(0),I_g_on_n(0),I_g_on_p(0),
                  C_ox(0), t_ox(0), n_to_p_eff_curr_drv_ratio(0), long_channel_leakage_reduction(0) { };
    // Return every member to zero, matching the state left by the constructor.
    void reset()
    {
      C_g_ideal = 0;
      C_fringe  = 0;
      C_overlap = 0;
      C_junc    = 0;
      C_junc_sidewall = 0;  // was previously skipped, leaving a stale value after reset()
      l_phy     = 0;
      l_elec    = 0;
      R_nch_on  = 0;
      R_pch_on  = 0;
      Vdd       = 0;
      Vth       = 0;
      I_on_n    = 0;
      I_on_p    = 0;
      I_off_n   = 0;
      I_off_p   = 0;
      I_g_on_n   = 0;
      I_g_on_p   = 0;
      C_ox      = 0;
      t_ox      = 0;
      n_to_p_eff_curr_drv_ratio = 0;
      long_channel_leakage_reduction = 0;
    }
    void display(uint32_t indent = 0);
  };
  // Parameters of one wire class.  Every member is zeroed both by the
  // constructor and by reset().
  class InterconnectType
  {
   public:
    double pitch;
    double R_per_um;
    double C_per_um;
    double horiz_dielectric_constant;
    double vert_dielectric_constant;
    double aspect_ratio;
    double miller_value;
    double ild_thickness;
    // All eight members are initialised; previously only the first three
    // were, so the remaining five started with indeterminate values.
    InterconnectType(): pitch(0), R_per_um(0), C_per_um(0),
                        horiz_dielectric_constant(0), vert_dielectric_constant(0),
                        aspect_ratio(0), miller_value(0), ild_thickness(0) { };
    void reset()
    {
      pitch = 0;
      R_per_um = 0;
      C_per_um = 0;
      horiz_dielectric_constant = 0;
      vert_dielectric_constant = 0;
      aspect_ratio = 0;
      miller_value = 0;
      ild_thickness = 0;
    }
    void display(uint32_t indent = 0);
  };
  // Parameters of one memory cell type.  Every member is zeroed both by
  // the constructor and by reset().
  class MemoryType
  {
   public:
    double b_w;
    double b_h;
    double cell_a_w;
    double cell_pmos_w;
    double cell_nmos_w;
    double Vbitpre;
    // Added for consistency with the sibling classes, which all zero
    // their members on construction.
    MemoryType(): b_w(0), b_h(0), cell_a_w(0), cell_pmos_w(0),
                  cell_nmos_w(0), Vbitpre(0) { };
    void reset()
    {
      b_w = 0;
      b_h = 0;
      cell_a_w = 0;
      cell_pmos_w = 0;
      cell_nmos_w = 0;
      Vbitpre = 0;
    }
    void display(uint32_t indent = 0);
  };
  // Scaling coefficients.  Every member is zeroed both by the constructor
  // and by reset().
  class ScalingFactor
  {
   public:
    double logic_scaling_co_eff;
    double core_tx_density;
    double long_channel_leakage_reduction;
    ScalingFactor(): logic_scaling_co_eff(0), core_tx_density(0),
    long_channel_leakage_reduction(0) { };
    void reset()
    {
      logic_scaling_co_eff= 0;
      core_tx_density = 0;
      long_channel_leakage_reduction= 0;
    }
    void display(uint32_t indent = 0);
  };
  // Scalar technology values.  Only the ones listed in reset() below are
  // cleared by it; the others are left untouched by reset().
  double ram_wl_stitching_overhead_;
  double min_w_nmos_;
  double max_w_nmos_;
  double max_w_nmos_dec;
  double unit_len_wire_del;
  double FO4;
  double kinv;
  double vpp;
  double w_sense_en;
  double w_sense_n;
  double w_sense_p;
  double sense_delay;
  double sense_dy_power;
  double w_iso;
  double w_poly_contact;
  double spacing_poly_to_poly;
  double spacing_poly_to_contact;
  double w_comp_inv_p1;
  double w_comp_inv_p2;
  double w_comp_inv_p3;
  double w_comp_inv_n1;
  double w_comp_inv_n2;
  double w_comp_inv_n3;
  double w_eval_inv_p;
  double w_eval_inv_n;
  double w_comp_n;
  double w_comp_p;
  double dram_cell_I_on;
  double dram_cell_Vdd;
  double dram_cell_I_off_worst_case_len_temp;
  double dram_cell_C;
  double gm_sense_amp_latch;
  double w_nmos_b_mux;
  double w_nmos_sa_mux;
  double w_pmos_bl_precharge;
  double w_pmos_bl_eq;
  double MIN_GAP_BET_P_AND_N_DIFFS;
  double MIN_GAP_BET_SAME_TYPE_DIFFS;
  double HPOWERRAIL;
  double cell_h_def;
  double chip_layout_overhead;
  double macro_layout_overhead;
  double sckt_co_eff;
  double fringe_cap;
  uint64_t h_dec;
  DeviceType sram_cell;   // SRAM cell transistor
  DeviceType dram_acc;    // DRAM access transistor
  DeviceType dram_wl;     // DRAM wordline transistor
  DeviceType peri_global; // peripheral global
  DeviceType cam_cell;    // CAM cell transistor (NOTE(review): original comment said "SRAM"; looks like a copy/paste -- confirm)
  InterconnectType wire_local;
  InterconnectType wire_inside_mat;
  InterconnectType wire_outside_mat;
  ScalingFactor scaling_factor;
  MemoryType sram;
  MemoryType dram;
  MemoryType cam;
  void display(uint32_t indent = 0);
  // Zero the DRAM-cell, sense-amp, fringe and layout-overhead scalars and
  // reset every nested DeviceType / InterconnectType / MemoryType /
  // ScalingFactor member.  Scalars not named here are not modified.
  void reset()
  {
    dram_cell_Vdd  = 0;
    dram_cell_I_on = 0;
    dram_cell_C    = 0;
    vpp            = 0;
    sense_delay               = 0;
    sense_dy_power            = 0;
    fringe_cap                = 0;
    // The dielectric constants, aspect ratio, miller value and ILD
    // thickness are members of InterconnectType and are cleared through
    // the wire_*.reset() calls below.
    dram_cell_I_off_worst_case_len_temp = 0;
    sram_cell.reset();
    dram_acc.reset();
    dram_wl.reset();
    peri_global.reset();
    cam_cell.reset();
    scaling_factor.reset();
    wire_local.reset();
    wire_inside_mat.reset();
    wire_outside_mat.reset();
    sram.reset();
    dram.reset();
    cam.reset();
    chip_layout_overhead  = 0;
    macro_layout_overhead = 0;
    sckt_co_eff           = 0;
  }
 };
 // Organisation parameters describing one array instance.  An object of
 // this class is handed to Mat and Subarray, which read the fields below
 // (e.g. Subarray copies num_r_subarray / num_c_subarray / cell / cam_cell).
 // Member order matters: it fixes the initialisation order used by the
 // constructors defined elsewhere.
 class DynamicParameter
 {
  public:
    // Array kind flags.
    bool is_tag;
    bool pure_ram;
    bool pure_cam;
    bool fully_assoc;
    int tagbits;
    int num_subarrays;  // only for leakage computation  -- the number of subarrays per bank
    int num_mats;       // only for leakage computation  -- the number of mats per bank
    // Partitioning / muxing degrees (set from the constructor arguments
    // of the same names -- presumably; the constructor body is outside this header).
    double Nspd;
    int Ndwl;
    int Ndbl;
    int Ndcm;
    int deg_bl_muxing;
    int deg_senseamp_muxing_non_associativity;
    int Ndsam_lev_1;
    int Ndsam_lev_2;
    int number_addr_bits_mat;             // per port
    int number_subbanks_decode;           // per_port
    // Signal counts at bank-per-port, mat and subbank level.  The
    // constructor adds an ECC adjustment to all of the num_d*/num_s*
    // counts when ECC is enabled.
    int num_di_b_bank_per_port;
    int num_do_b_bank_per_port;
    int num_di_b_mat;
    int num_do_b_mat;
    int num_di_b_subbank;
    int num_do_b_subbank;
    int num_si_b_mat;
    int num_so_b_mat;
    int num_si_b_subbank;
    int num_so_b_subbank;
        int num_si_b_bank_per_port;
        int num_so_b_bank_per_port;
    int number_way_select_signals_mat;
    int num_act_mats_hor_dir;
    int num_act_mats_hor_dir_sl;
    bool is_dram;
    double V_b_sense;
    // Rows / columns of one subarray.
    unsigned int num_r_subarray;
    unsigned int num_c_subarray;
    int tag_num_r_subarray;//sheng: fully associative cache tag and data must be computed together, data and tag must be separate
    int tag_num_c_subarray;
    int data_num_r_subarray;
    int data_num_c_subarray;
    int num_mats_h_dir;
    int num_mats_v_dir;
    uint32_t ram_cell_tech_type;
    double dram_refresh_period;
    // Default constructor; callers such as Router::buffer_stats() fill in
    // the fields by hand afterwards.
    DynamicParameter();
    // Constructor taking the partitioning parameters; defined elsewhere.
    DynamicParameter(
        bool         is_tag_,
        int          pure_ram_,
        int          pure_cam_,
        double       Nspd_,
        unsigned int Ndwl_,
        unsigned int Ndbl_,
        unsigned int Ndcm_,
        unsigned int Ndsam_lev_1_,
        unsigned int Ndsam_lev_2_,
        bool         is_main_mem_);
    int use_inp_params;
    // Port counts.
    unsigned int num_rw_ports;
    unsigned int num_rd_ports;
    unsigned int num_wr_ports;
    unsigned int num_se_rd_ports;  // number of single ended read ports
    unsigned int num_search_ports;
    unsigned int out_w;// == nr_bits_out
    bool   is_main_mem;
    Area   cell, cam_cell;//cell is the sram_cell in both normal cache/ram and FA.
    bool   is_valid;  // set to true at the end of the parameterised constructor
 };
 // Process-wide input parameters and technology parameters.  Declared here,
 // defined in another translation unit.
 extern InputParameter * g_ip;
 extern TechnologyParameter g_tp;
 #endif
--- a/ext/mcpat/cacti/router.cc
+++ b/ext/mcpat/cacti/router.cc
@ -0,0 +1,311 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include "router.h"
 /*
  * Construct a router model and immediately evaluate it.
  *
  *   flit_size_  link/flit width
  *   vc_buf      buffer entries per virtual channel
  *               (vc size = vc_buffer_size * flit_size)
  *   vc_c        number of virtual channels
  *   dt          device type supplying Vdd and the n-to-p drive ratio
  *   I_, O_      number of crossbar input / output ports
  *   M_          network load
  *
  * The transistor and track dimensions below are all derived from the
  * feature size g_ip->F_sz_um.  calc_router_parameters() is called last so
  * that delay, power and area are available as soon as the object exists.
  */
 Router::Router(
    double flit_size_,
    double vc_buf, /* vc size = vc_buffer_size * flit_size */
    double vc_c,
    TechnologyParameter::DeviceType *dt,
    double I_,
    double O_,
    double M_
    ):flit_size(flit_size_),
      vc_count(vc_c),
      vc_buffer_size(vc_buf),
      deviceType(dt),
      I(I_),
      O(O_),
      M(M_),
      Vdd(dt->Vdd)
 {
  const double feature_um = g_ip->F_sz_um;

  min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_;

  /* Crossbar parameters. A transmission gate is employed as the connector. */
  NTtr = 10*feature_um*1e-6/2;   /* transmission gate's nmos tr. length */
  PTtr = 20*feature_um*1e-6/2;   /* pmos tr. length */
  wt   = 15*feature_um*1e-6/2;   /* track width */
  ht   = 15*feature_um*1e-6/2;   /* track height */

  /* The crossbar port counts I and O come from the constructor arguments. */
  NTi  = 12.5*feature_um*1e-6/2;
  PTi  = 25*feature_um*1e-6/2;

  NTid = 60*feature_um*1e-6/2;   // m
  PTid = 120*feature_um*1e-6/2;  // m
  NTod = 60*feature_um*1e-6/2;   // m
  PTod = 120*feature_um*1e-6/2;  // m

  calc_router_parameters();
 }
 /* Nothing to release: the destructor body is empty. */
 Router::~Router(){}
 double //wire cap with triple spacing
 Router::Cw3(double length) {
  Wire wc(g_ip->wt, length, 1, 3, 3);
  return (wc.wire_cap(length));
 }
 /*
  * Gate capacitance of a transistor of width w.
  * The width is multiplied by 1e6 (tagged "u" in the original source)
  * before being handed to gate_C().
  */
 double
 Router::gate_cap(double w) {
  const double scaled_width = w*1e6; /*u*/
  return gate_C(scaled_width, 0);
 }
 /*
  * Diffusion capacitance of a transistor of width w.
  *   type  0 for n-mos and 1 for p-mos
  *   s     number of stacking transistors (truncated to int)
  * The width is multiplied by 1e6 before being handed to drain_C_().
  */
 double
 Router::diff_cap(double w, int type /*0 for n-mos and 1 for p-mos*/,
    double s /*number of stacking transistors*/) {
  const double scaled_width = w*1e6; /*u*/
  const int stack = (int) s;
  return drain_C_(scaled_width, type, stack, 1, g_tp.cell_h_def);
 }
 /* Crossbar related functions. */
 // Simple transmission-gate model: the input sees the diffusion
 // capacitance of the nmos (NTtr) and pmos (PTtr) devices.
 double
 Router::transmission_buf_inpcap() {
  const double n_diff = diff_cap(NTtr, 0, 1);
  const double p_diff = diff_cap(PTtr, 1, 1);
  return n_diff + p_diff;
 }
 // Output capacitance of the transmission gate: nmos plus pmos diffusion
 // capacitance (same expression as the input side).
 double
 Router::transmission_buf_outcap() {
  const double n_diff = diff_cap(NTtr, 0, 1);
  const double p_diff = diff_cap(PTtr, 1, 1);
  return n_diff + p_diff;
 }
 // Control capacitance of the transmission gate: nmos plus pmos gate
 // capacitance.
 double
 Router::transmission_buf_ctrcap() {
  const double n_gate = gate_cap(NTtr);
  const double p_gate = gate_cap(PTtr);
  return n_gate + p_gate;
 }
 // Capacitance of one crossbar input line: wire of length O*flit_size*wt,
 // O transmission-gate inputs, and the gate and diffusion capacitance of
 // the NTid/PTid devices.  Terms are accumulated left to right.
 double
 Router::crossbar_inpline() {
  double cap = Cw3(O*flit_size*wt);
  cap = cap + O*transmission_buf_inpcap();
  cap = cap + gate_cap(NTid);
  cap = cap + gate_cap(PTid);
  cap = cap + diff_cap(NTid, 0, 1);
  cap = cap + diff_cap(PTid, 1, 1);
  return cap;
 }
 // Capacitance of one crossbar output line: wire of length I*flit_size*ht,
 // I transmission-gate outputs, and the gate and diffusion capacitance of
 // the NTod/PTod devices.  Terms are accumulated left to right.
 double
 Router::crossbar_outline() {
  double cap = Cw3(I*flit_size*ht);
  cap = cap + I*transmission_buf_outcap();
  cap = cap + gate_cap(NTod);
  cap = cap + gate_cap(PTod);
  cap = cap + diff_cap(NTod, 0, 1);
  cap = cap + diff_cap(PTod, 1, 1);
  return cap;
 }
 // Capacitance of one crossbar control line: wire of length
 // 0.5*O*flit_size*wt, flit_size transmission-gate control inputs, and the
 // diffusion and gate capacitance of the NTi/PTi devices.  Terms are
 // accumulated left to right.
 double
 Router::crossbar_ctrline() {
  double cap = Cw3(0.5*O*flit_size*wt);
  cap = cap + flit_size*transmission_buf_ctrcap();
  cap = cap + diff_cap(NTi, 0, 1);
  cap = cap + diff_cap(PTi, 1, 1);
  cap = cap + gate_cap(NTi);
  cap = cap + gate_cap(PTi);
  return cap;
 }
 // Crossbar traversal term built from the input-line and output-line
 // capacitances: each contributes C*Vdd*Vdd*flit_size/2, and the sum is
 // doubled.
 double
 Router::tr_crossbar_power() {
  const double inp_term = crossbar_inpline()*Vdd*Vdd*flit_size/2;
  const double out_term = crossbar_outline()*Vdd*Vdd*flit_size/2;
  return (inp_term + out_term)*2;
 }
 void Router::buffer_stats()
 {
  DynamicParameter dyn_p;
  dyn_p.is_tag      = false;
  dyn_p.pure_cam    = false;
  dyn_p.fully_assoc = false;
  dyn_p.pure_ram    = true;
  dyn_p.is_dram     = false;
  dyn_p.is_main_mem = false;
  dyn_p.num_subarrays = 1;
  dyn_p.num_mats = 1;
  dyn_p.Ndbl = 1;
  dyn_p.Ndwl = 1;
  dyn_p.Nspd = 1;
  dyn_p.deg_bl_muxing = 1;
  dyn_p.deg_senseamp_muxing_non_associativity = 1;
  dyn_p.Ndsam_lev_1 = 1;
  dyn_p.Ndsam_lev_2 = 1;
  dyn_p.Ndcm = 1;
  dyn_p.number_addr_bits_mat = 8;
  dyn_p.number_way_select_signals_mat = 1;
  dyn_p.number_subbanks_decode = 0;
  dyn_p.num_act_mats_hor_dir = 1;
  dyn_p.V_b_sense = Vdd; // FIXME check power calc.
  dyn_p.ram_cell_tech_type = 0;
  dyn_p.num_r_subarray = (int) vc_buffer_size;
  dyn_p.num_c_subarray = (int) flit_size * (int) vc_count;
  dyn_p.num_mats_h_dir = 1;
  dyn_p.num_mats_v_dir = 1;
  dyn_p.num_do_b_subbank = (int)flit_size;
  dyn_p.num_di_b_subbank = (int)flit_size;
  dyn_p.num_do_b_mat = (int) flit_size;
  dyn_p.num_di_b_mat = (int) flit_size;
  dyn_p.num_do_b_mat = (int) flit_size;
  dyn_p.num_di_b_mat = (int) flit_size;
  dyn_p.num_do_b_bank_per_port = (int) flit_size;
  dyn_p.num_di_b_bank_per_port = (int) flit_size;
  dyn_p.out_w = (int) flit_size;
  dyn_p.use_inp_params = 1;
  dyn_p.num_wr_ports = (unsigned int) vc_count;
  dyn_p.num_rd_ports = 1;//(unsigned int) vc_count;//based on Bill Dally's book
  dyn_p.num_rw_ports = 0;
  dyn_p.num_se_rd_ports =0;
  dyn_p.num_search_ports =0;
  dyn_p.cell.h = g_tp.sram.b_h + 2 * g_tp.wire_outside_mat.pitch * (dyn_p.num_wr_ports +
      dyn_p.num_rw_ports - 1 + dyn_p.num_rd_ports);
  dyn_p.cell.w = g_tp.sram.b_w + 2 * g_tp.wire_outside_mat.pitch * (dyn_p.num_rw_ports - 1 +
      (dyn_p.num_rd_ports - dyn_p.num_se_rd_ports) +
      dyn_p.num_wr_ports) + g_tp.wire_outside_mat.pitch * dyn_p.num_se_rd_ports;
  Mat buff(dyn_p);
  buff.compute_delays(0);
  buff.compute_power_energy();
  buffer.power.readOp  = buff.power.readOp;
  buffer.power.writeOp = buffer.power.readOp; //FIXME
  buffer.area = buff.area;
 }
  void
 Router::cb_stats ()
 {
  if (1) {
    Crossbar c_b(I, O, flit_size);
    c_b.compute_power();
    crossbar.delay = c_b.delay;
    crossbar.power.readOp.dynamic = c_b.power.readOp.dynamic;
    crossbar.power.readOp.leakage = c_b.power.readOp.leakage;
    crossbar.power.readOp.gate_leakage = c_b.power.readOp.gate_leakage;
    crossbar.area = c_b.area;
 //  c_b.print_crossbar();
  }
  else {
    crossbar.power.readOp.dynamic = tr_crossbar_power();
    crossbar.power.readOp.leakage = flit_size * I * O *
        cmos_Isub_leakage(NTtr*g_tp.min_w_nmos_, PTtr*min_w_pmos, 1, tg);
    crossbar.power.readOp.gate_leakage = flit_size * I * O *
        cmos_Ig_leakage(NTtr*g_tp.min_w_nmos_, PTtr*min_w_pmos, 1, tg);
  }
 }
 void
 Router::get_router_power()
 {
  /* calculate buffer stats */
  buffer_stats();
  /* calculate cross-bar stats */
  cb_stats();
  /* calculate arbiter stats */
  Arbiter vcarb(vc_count, flit_size, buffer.area.w);
  Arbiter cbarb(I, flit_size, crossbar.area.w);
  vcarb.compute_power();
  cbarb.compute_power();
  arbiter.power.readOp.dynamic = vcarb.power.readOp.dynamic * I +
    cbarb.power.readOp.dynamic * O;
  arbiter.power.readOp.leakage = vcarb.power.readOp.leakage * I +
    cbarb.power.readOp.leakage * O;
  arbiter.power.readOp.gate_leakage = vcarb.power.readOp.gate_leakage * I +
    cbarb.power.readOp.gate_leakage * O;
 //  arb_stats();
  power.readOp.dynamic = ((buffer.power.readOp.dynamic+buffer.power.writeOp.dynamic) +
                  crossbar.power.readOp.dynamic +
                  arbiter.power.readOp.dynamic)*MIN(I, O)*M;
  double pppm_t[4]    = {1,I,I,1};
  power = power + (buffer.power*pppm_t + crossbar.power + arbiter.power)*pppm_lkg;
 }
  /*
   * Set the router frequency, cycle time and pipeline depth.
   *
   * Starts from a fixed 5 GHz target and 4 pipeline stages.  max_cyc is
   * 17 FO4 delays converted to ps; if the target cycle time is shorter
   * than that, FREQUENCY is lowered to 1/max_cyc.
   * NOTE(review): cycle_time is not recomputed after FREQUENCY is lowered
   * -- confirm whether callers expect the original 5 GHz value.
   */
  void
 Router::get_router_delay ()
 {
  FREQUENCY = 5; // move this to config file --TODO
  delay = 4;
  cycle_time = (1/FREQUENCY)*1e3; //ps

  max_cyc = 17 * g_tp.FO4 * 1e12; //s -> ps

  const bool target_too_fast = cycle_time < max_cyc;
  if (target_too_fast) {
    FREQUENCY = (1/max_cyc)*1e3; //GHz
  }
 }
  /*
   * Router footprint: height is I buffer heights, width is one buffer
   * width plus the crossbar width.
   */
  void
 Router::get_router_area()
 {
  const double total_w = buffer.area.w+crossbar.area.w;
  const double total_h = I*buffer.area.h;
  area.h = total_h;
  area.w = total_w;
 }
  /*
   * Evaluate the whole router model.  Called once from the constructor.
   * The order matters: get_router_area() reads buffer.area and
   * crossbar.area, which are filled in by get_router_power().
   */
  void
 Router::calc_router_parameters()
 {
  /* calculate router frequency and pipeline cycles */
  get_router_delay();
  /* router power stats (also fills buffer / crossbar / arbiter) */
  get_router_power();
  /* area stats (needs the buffer and crossbar areas from the step above) */
  get_router_area();
 }
  /*
   * Print a human-readable summary of the router to cout: area, frequency,
   * configuration, and the buffer / crossbar / arbiter area, access energy
   * and leakage.  Values are scaled as in the labels (1e-6 for areas, 1e9
   * for energies, 1e3 for leakage).
   */
  void
 Router::print_router()
 {
  /* overall figures and configuration */
  cout << "\n\nRouter stats:\n"
       << "\tRouter Area - "<< area.get_area()*1e-6<<"(mm^2)\n"
       << "\tMaximum possible network frequency - " << (1/max_cyc)*1e3 << "GHz\n"
       << "\tNetwork frequency - " << FREQUENCY <<" GHz\n"
       << "\tNo. of Virtual channels - " << vc_count << "\n"
       << "\tNo. of pipeline stages - " << delay << endl;
  cout << "\tLink bandwidth - " << flit_size << " (bits)\n"
       << "\tNo. of buffer entries per virtual channel -  "<< vc_buffer_size << "\n";

  /* buffer */
  cout << "\tSimple buffer Area - "<< buffer.area.get_area()*1e-6<<"(mm^2)\n"
       << "\tSimple buffer access (Read) - " << buffer.power.readOp.dynamic * 1e9 <<" (nJ)\n"
       << "\tSimple buffer leakage - " << buffer.power.readOp.leakage * 1e3 <<" (mW)\n";

  /* crossbar */
  cout << "\tCrossbar Area - "<< crossbar.area.get_area()*1e-6<<"(mm^2)\n"
       << "\tCross bar access energy - " << crossbar.power.readOp.dynamic * 1e9<<" (nJ)\n"
       << "\tCross bar leakage power - " << crossbar.power.readOp.leakage * 1e3<<" (mW)\n";

  /* arbiters */
  cout << "\tArbiter access energy (VC arb + Crossbar arb) - "<<arbiter.power.readOp.dynamic * 1e9 <<" (nJ)\n"
       << "\tArbiter leakage (VC arb + Crossbar arb) - "<<arbiter.power.readOp.leakage * 1e3 <<" (mW)\n";
 }
--- a/ext/mcpat/cacti/router.h
+++ b/ext/mcpat/cacti/router.h
@ -0,0 +1,115 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __ROUTER_H__
 #define __ROUTER_H__
 #include <assert.h>
 #include <iostream>
 #include "arbiter.h"
 #include "basic_circuit.h"
 #include "cacti_interface.h"
 #include "component.h"
 #include "crossbar.h"
 #include "mat.h"
 #include "parameter.h"
 #include "wire.h"
 /*
  * Router model built from three sub-components: an input buffer, a
  * crossbar and arbiters.  The constructor evaluates the model immediately
  * (delay, power, area), so a constructed object is ready to be queried or
  * printed with print_router().
  *
  * Member order is significant: the constructor's initializer list relies
  * on it.
  */
 class Router : public Component
 {
  public:
    /*
     * flit_size_  link/flit width
     * vc_buf      buffer entries per virtual channel
     * vc_count    number of virtual channels
     * dt          device type (defaults to the peripheral-global device)
     * I_, O_      crossbar input / output port counts
     * M_          network load
     */
    Router(
        double flit_size_,
        double vc_buf, /* vc size = vc_buffer_size * flit_size */
        double vc_count,
        TechnologyParameter::DeviceType *dt = &(g_tp.peri_global),
        double I_ = 5,
        double O_ = 5,
        double M_ = 0.6);
    ~Router();
    /* Print a summary of the router to cout. */
    void print_router();
    /* Per-component results filled in while the router power is computed. */
    Component arbiter, crossbar, buffer;
    double cycle_time, max_cyc;
    double flit_size;
    double vc_count;
    double vc_buffer_size; /* vc size = vc_buffer_size * flit_size */
  private:
        TechnologyParameter::DeviceType *deviceType;
        double FREQUENCY; // move this to config file --TODO
    /* Capacitance helpers. */
    double Cw3(double len);
    double gate_cap(double w);
    double diff_cap(double w, int type /*0 for n-mos and 1 for p-mos*/, double stack);
    /* NOTE(review): wtype and wire_placement are not referenced in router.cc -- confirm whether still needed. */
    enum Wire_type wtype;
    enum Wire_placement wire_placement;
    //crossbar
    /* Transistor / track dimensions and port counts; TriS1 and TriS2 are
     * not assigned in router.cc. */
    double NTtr, PTtr, wt, ht, I, O, NTi, PTi, NTid, PTid, NTod, PTod, TriS1, TriS2;
    double M; //network load
    double transmission_buf_inpcap();
    double transmission_buf_outcap();
    double transmission_buf_ctrcap();
    double crossbar_inpline();
    double crossbar_outline();
    double crossbar_ctrline();
    double tr_crossbar_power();
    void  cb_stats ();
    /* NOTE(review): arb_power(), arb_stats() and buffer_params() are declared
     * here but no definition is visible in router.cc -- confirm. */
    double arb_power();
    void  arb_stats ();
    double buffer_params();
    void buffer_stats();
    //arbiter
    //buffer
    //router params
    double Vdd;
    /* Evaluation steps; calc_router_parameters() runs the other three. */
    void calc_router_parameters();
    void get_router_area();
    void get_router_power();
    void get_router_delay();
    double min_w_pmos;
 };
 #endif
--- a/ext/mcpat/cacti/subarray.cc
+++ b/ext/mcpat/cacti/subarray.cc
@ -0,0 +1,196 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include <cassert>
 #include <cmath>
 #include <iostream>
 #include "subarray.h"
 /*
  * Build a subarray from the dynamic parameters and compute its area and
  * wordline/bitline capacitances.
  *
  *   dp_     organisation parameters (rows, columns, cell sizes, flags)
  *   is_fa_  true when the subarray belongs to a fully associative cache
  *
  * Two layouts are handled:
  *   - plain RAM (neither fully associative nor pure CAM): area is
  *     rows x columns of `cell` plus wordline stitching overhead;
  *   - CAM / fully associative: height is set by the CAM cell (with one
  *     extra row), width is the CAM columns plus the RAM columns plus
  *     stitching and two fixed wiring overheads.
  * ECC columns are added first when g_ip->add_ecc_b_ is set.
  *
  * NOTE(review): the ceil() around the stitching divisions has no effect if
  * both operands are integers -- confirm the operand types and whether a
  * true ceiling was intended.
  */
 Subarray::Subarray(const DynamicParameter & dp_, bool is_fa_):
  dp(dp_), num_rows(dp.num_r_subarray), num_cols(dp.num_c_subarray),
  num_cols_fa_cam(dp.tag_num_c_subarray), num_cols_fa_ram(dp.data_num_c_subarray),
  cell(dp.cell), cam_cell(dp.cam_cell), is_fa(is_fa_)
 {
  const bool has_cam_part = is_fa || dp.pure_cam;

  if (has_cam_part)
  {
    // cam / fa.  No dummy row is added to num_rows here since the dummy
    // row does not need a decoder.
    if (g_ip->add_ecc_b_)
    {
      num_cols_fa_cam += (int)ceil(num_cols_fa_cam / num_bits_per_ecc_b_);
    }
    if (is_fa)
    {
      // fully associative cache: the RAM part gets ECC columns as well
      if (g_ip->add_ecc_b_)
      {
        num_cols_fa_ram += (int)ceil(num_cols_fa_ram / num_bits_per_ecc_b_);
      }
    }
    else
    {
      // pure CAM: there is no RAM part
      num_cols_fa_ram = 0;
    }
    num_cols = num_cols_fa_cam + num_cols_fa_ram;

    // height of subarray is decided by CAM array; blank space in the sram
    // array is filled with dummy cells
    area.h = cam_cell.h * (num_rows + 1);
    area.w = cam_cell.w * num_cols_fa_cam + cell.w * num_cols_fa_ram
      + ceil((num_cols_fa_cam + num_cols_fa_ram) / sram_num_cells_wl_stitching_)*g_tp.ram_wl_stitching_overhead_
      + 16*g_tp.wire_local.pitch    // overhead for the NAND gate connecting the two halves
      + 128*g_tp.wire_local.pitch;  // overhead for the drivers from matchline to wordline of RAM
  }
  else
  {
    if (g_ip->add_ecc_b_)
    {
      num_cols += (int)ceil(num_cols / num_bits_per_ecc_b_);   // ECC overhead
    }

    // cells between wordline stitches depend on the RAM cell technology
    uint32_t cells_per_stitch;
    if (dp.ram_cell_tech_type == lp_dram)
    {
      cells_per_stitch = dram_num_cells_wl_stitching_;
    }
    else if (dp.ram_cell_tech_type == comm_dram)
    {
      cells_per_stitch = comm_dram_num_cells_wl_stitching_;
    }
    else
    {
      cells_per_stitch = sram_num_cells_wl_stitching_;
    }

    area.h = cell.h * num_rows;
    area.w = cell.w * num_cols +
      ceil(num_cols / cells_per_stitch) * g_tp.ram_wl_stitching_overhead_;  // stitching overhead
  }

  assert(area.h>0);
  assert(area.w>0);
  compute_C();
 }
 // Nothing to release: the destructor body is empty.
 Subarray::~Subarray()
 {
 }
 // Total area occupied by the cells of this subarray.
 //   - fully associative: CAM-cell height times (rows + 1) times the
 //     combined width of the CAM and RAM columns; this includes the dummy
 //     cells in the SRAM array;
 //   - pure CAM: CAM-cell area times (rows + 1) times the CAM columns;
 //   - plain RAM: cell area times rows times columns.
 double Subarray::get_total_cell_area()
 {
    if (is_fa)
    {
      const double row_width = cam_cell.w*num_cols_fa_cam + cell.w*num_cols_fa_ram;
      return (cam_cell.h*(num_rows+1)*(row_width));
    }
    if (dp.pure_cam)
    {
      return (cam_cell.get_area()*(num_rows+1)*num_cols_fa_cam );
    }
    return (cell.get_area() * num_rows * num_cols);
 }
 // Compute the wordline and bitline capacitances of the subarray.
 // Writes C_wl and C_bl on every path.  On the CAM / fully associative
 // path it additionally writes C_wl_cam, R_wl_cam, C_wl_ram, R_wl_ram, R_wl
 // and C_bl_cam; R_wl is not assigned on the other paths of this function.
 // The per-cell metal values (c_w_metal, r_w_metal, C_b_metal) start from
 // the RAM cell dimensions and are re-derived from the CAM cell where the
 // CAM portion is evaluated, so statement order matters.
 void Subarray::compute_C()
 {
  // per-cell wire capacitance / resistance along the wordline (cell width)
  // and per-cell wire capacitance along the bitline (cell height)
  double c_w_metal = cell.w * g_tp.wire_local.C_per_um;
  double r_w_metal = cell.w * g_tp.wire_local.R_per_um;
  double C_b_metal = cell.h * g_tp.wire_local.C_per_um;
  double C_b_row_drain_C;
  if (dp.is_dram)
  {
    // DRAM: one pass-gate per cell on the wordline
    C_wl = (gate_C_pass(g_tp.dram.cell_a_w, g_tp.dram.b_w, true, true) + c_w_metal) * num_cols;
    if (dp.ram_cell_tech_type == comm_dram)
    {
      // commodity DRAM: bitline capacitance is the metal only
      C_bl = num_rows * C_b_metal;
    }
    else
    {
      C_b_row_drain_C = drain_C_(g_tp.dram.cell_a_w, NCH, 1, 0, cell.w, true, true) / 2.0;  // due to shared contact
      C_bl = num_rows * (C_b_row_drain_C + C_b_metal);
    }
  }
  else
  {
          if (!(is_fa ||dp.pure_cam))
          {
                  // plain SRAM: two pass-gates per cell on the wordline
                  C_wl = (gate_C_pass(g_tp.sram.cell_a_w, (g_tp.sram.b_w-2*g_tp.sram.cell_a_w)/2.0, false, true)*2 +
                                  c_w_metal) * num_cols;
                  C_b_row_drain_C = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0;  // due to shared contact
                  C_bl = num_rows * (C_b_row_drain_C + C_b_metal);
          }
          else
          {
                 //Following is wordline not matchline
                 //CAM portion
                 c_w_metal = cam_cell.w * g_tp.wire_local.C_per_um;
                 r_w_metal = cam_cell.w * g_tp.wire_local.R_per_um;
         C_wl_cam = (gate_C_pass(g_tp.cam.cell_a_w, (g_tp.cam.b_w-2*g_tp.cam.cell_a_w)/2.0, false, true)*2 +
                                  c_w_metal) * num_cols_fa_cam;
         R_wl_cam = (r_w_metal) * num_cols_fa_cam;
         if (!dp.pure_cam)
         {
                 //RAM portion
                 c_w_metal = cell.w * g_tp.wire_local.C_per_um;
                 r_w_metal = cell.w * g_tp.wire_local.R_per_um;
                 C_wl_ram = (gate_C_pass(g_tp.sram.cell_a_w, (g_tp.sram.b_w-2*g_tp.sram.cell_a_w)/2.0, false, true)*2 +
                                 c_w_metal) * num_cols_fa_ram;
                 R_wl_ram = (r_w_metal) * num_cols_fa_ram;
         }
         else
         {
                 // pure CAM: no RAM portion
                 C_wl_ram = R_wl_ram =0;
         }
         // total wordline = CAM part + RAM part + the fixed (16+128)-pitch
         // wiring overhead that the constructor also adds to the width
         C_wl = C_wl_cam + C_wl_ram;
         C_wl += (16+128)*g_tp.wire_local.pitch*g_tp.wire_local.C_per_um;
         R_wl = R_wl_cam + R_wl_ram;
         R_wl += (16+128)*g_tp.wire_local.pitch*g_tp.wire_local.R_per_um;
         //there are two ways to write to a FA,
         //1) Write to CAM array then force a match on match line to activate the corresponding wordline in RAM;
         //2) using separate wordline for read/write and search in RAM.
         //We are using the second approach.
         //Bitline CAM portion. This is bitline not searchline. We assume no sharing between bitline and searchline according to SUN's implementations.
         C_b_metal = cam_cell.h * g_tp.wire_local.C_per_um;
         C_b_row_drain_C = drain_C_(g_tp.cam.cell_a_w, NCH, 1, 0, cam_cell.w, false, true) / 2.0;  // due to shared contact
         C_bl_cam = (num_rows+1) * (C_b_row_drain_C + C_b_metal);
         //height of subarray is decided by CAM array. blank space in sram array are filled with dummy cells
         // RAM bitline: SRAM drain capacitance, but C_b_metal is still the
         // CAM-cell-height value assigned above
         C_b_row_drain_C = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0;  // due to shared contact
         C_bl = (num_rows +1) * (C_b_row_drain_C + C_b_metal);
          }
  }
 }
--- a/ext/mcpat/cacti/subarray.h
+++ b/ext/mcpat/cacti/subarray.h
@ -0,0 +1,70 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __SUBARRAY_H__
 #define __SUBARRAY_H__
 #include "area.h"
 #include "component.h"
 #include "parameter.h"
 using namespace std;
 // Model of a single memory subarray: holds the array dimensions, the cell
 // footprints and the lumped wordline / bitline parasitics that compute_C()
 // fills in.
 class Subarray : public Component
 {
  public:
    // dp is kept by reference (see the member below), so the referenced
    // DynamicParameter has to outlive this object.
    Subarray(const DynamicParameter & dp, bool is_fa_);
    ~Subarray();
    const DynamicParameter & dp;  // stored as a reference, not a copy
    double  get_total_cell_area();
    unsigned int num_rows;
    unsigned int num_cols;
    // Column counts used by compute_C() to scale the per-cell CAM and RAM
    // wordline parasitics into C_wl_cam/R_wl_cam and C_wl_ram/R_wl_ram.
    int32_t num_cols_fa_cam;
    int32_t num_cols_fa_ram;
    // Cell footprints; compute_C() reads cell.w and cam_cell.w/cam_cell.h.
    Area    cell, cam_cell;
    bool    is_fa;  // presumably "is fully associative" -- confirm against the constructor
    // Wordline capacitance: total, and its CAM and RAM shares.
    double  C_wl, C_wl_cam, C_wl_ram;
    // Wordline resistance: total, and its CAM and RAM shares.
    double  R_wl, R_wl_cam, R_wl_ram;
    // Bitline capacitance of the RAM (C_bl) and CAM (C_bl_cam) portions.
    double  C_bl, C_bl_cam;
  private:
    void compute_C();  // compute bitline and wordline capacitance
 };
 #endif
--- a/ext/mcpat/cacti/technology.cc
+++ b/ext/mcpat/cacti/technology.cc
--- a/ext/mcpat/cacti/uca.cc
+++ b/ext/mcpat/cacti/uca.cc
@ -0,0 +1,426 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include <cmath>
 #include <cstddef>
 #include <iostream>
 #include "uca.h"
 // Build the array-level model around one bank.
 //
 // dyn_p is copied into the dp member, the bank is constructed from that copy,
 // the per-bank port/bit counts are derived, the array-level H-trees are
 // allocated, and finally compute_delays() and compute_power_energy() are run
 // so the object is fully populated when the constructor returns.
 //
 // The two search H-trees only exist for fully-associative and pure-CAM
 // arrays; they are null-initialised here so that every other configuration
 // holds well-defined (null) pointers instead of indeterminate ones.
 UCA::UCA(const DynamicParameter & dyn_p)
 :dp(dyn_p), bank(dp), htree_in_search(NULL), htree_out_search(NULL),
  nbanks(g_ip->nbanks), refresh_power(0)
 {
  // Split the banks between the vertical and horizontal direction; which
  // direction gets the larger power-of-two share depends on the bank's
  // aspect ratio.
  int num_banks_ver_dir = 1 << ((bank.area.h > bank.area.w) ? _log2(nbanks)/2 : (_log2(nbanks) - _log2(nbanks)/2));
  int num_banks_hor_dir = nbanks/num_banks_ver_dir;
  // Port counts come either from the dynamic parameters or from the global
  // input parameters.
  if (dp.use_inp_params)
  {
    RWP  = dp.num_rw_ports;
    ERP  = dp.num_rd_ports;
    EWP  = dp.num_wr_ports;
    SCHP = dp.num_search_ports;
  }
  else
  {
    RWP  = g_ip->num_rw_ports;
    ERP  = g_ip->num_rd_ports;
    EWP  = g_ip->num_wr_ports;
    SCHP = g_ip->num_search_ports;
  }
  // Number of address / data-in / data-out / search-in / search-out bits
  // routed to a bank, summed over the ports that carry them.
  num_addr_b_bank = (dp.number_addr_bits_mat + dp.number_subbanks_decode)*(RWP+ERP+EWP);
  num_di_b_bank   = dp.num_di_b_bank_per_port * (RWP + EWP);
  num_do_b_bank   = dp.num_do_b_bank_per_port * (RWP + ERP);
  num_si_b_bank   = dp.num_si_b_bank_per_port * SCHP;
  num_so_b_bank   = dp.num_so_b_bank_per_port * SCHP;
  if (!dp.fully_assoc && !dp.pure_cam)
  {
    // In fast-access mode a data array drives out all ways at once.
    if (g_ip->fast_access && dp.is_tag == false)
    {
      num_do_b_bank *= g_ip->data_assoc;
    }
    // Plain RAM: no search bits are routed, so the search widths are 0.
    htree_in_add   = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                num_addr_b_bank, num_di_b_bank, 0, num_do_b_bank, 0,
                                num_banks_ver_dir*2, num_banks_hor_dir*2, Add_htree, true);
    htree_in_data  = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                num_addr_b_bank, num_di_b_bank, 0, num_do_b_bank, 0,
                                num_banks_ver_dir*2, num_banks_hor_dir*2, Data_in_htree, true);
    htree_out_data = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                num_addr_b_bank, num_di_b_bank, 0, num_do_b_bank, 0,
                                num_banks_ver_dir*2, num_banks_hor_dir*2, Data_out_htree, true);
  }
  else
  {
    // Fully-associative / CAM: the regular trees also carry the search bit
    // counts, and two extra trees model the search-in / search-out networks.
    htree_in_add   = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
                                num_banks_ver_dir*2, num_banks_hor_dir*2, Add_htree, true);
    htree_in_data  = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
                                num_banks_ver_dir*2, num_banks_hor_dir*2, Data_in_htree, true);
    htree_out_data = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
                                num_banks_ver_dir*2, num_banks_hor_dir*2, Data_out_htree, true);
    htree_in_search  = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                  num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
                                  num_banks_ver_dir*2, num_banks_hor_dir*2, Data_in_htree, true);
    htree_out_search = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                  num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
                                  num_banks_ver_dir*2, num_banks_hor_dir*2, Data_out_htree, true);
  }
  // The array footprint is taken from the data-in H-tree.
  area.w = htree_in_data->area.w;
  area.h = htree_in_data->area.h;
  area_all_dataramcells = bank.mat.subarray.get_total_cell_area() * dp.num_subarrays * g_ip->nbanks;
  // delay calculation
  double inrisetime = 0.0;
  compute_delays(inrisetime);
  compute_power_energy();
 }
 // Release every H-tree the constructor allocated.
 UCA::~UCA()
 {
  delete htree_in_add;
  delete htree_in_data;
  delete htree_out_data;
  // The search H-trees are allocated only for fully-associative and pure-CAM
  // arrays (the same condition the constructor uses); in any other
  // configuration the pointers are never assigned and must not be touched.
  if (dp.fully_assoc || dp.pure_cam)
  {
    delete htree_in_search;
    delete htree_out_search;
  }
 }
 // Derive the array-level timing from the bank and H-tree components.
 //
 // inrisetime is forwarded to Bank::compute_delays(); whatever that call
 // returns is handed back unchanged as the output rise time.  As a side
 // effect this sets delay_array_to_sa_mux_lev_{1,2}_decoder,
 // delay_before_subarray_output_driver, delay_from_subarray_out_drv_to_out,
 // access_time, cycle_time, multisubbank_interleave_cycle_time and
 // precharge_delay.
 double UCA::compute_delays(double inrisetime)
 {
  double outrisetime = bank.compute_delays(inrisetime);
  // Request path: array-level address H-tree followed by the bank-level one.
  double delay_array_to_mat = htree_in_add->delay + bank.htree_in_add->delay;
  double max_delay_before_row_decoder = delay_array_to_mat + bank.mat.r_predec->delay;
  delay_array_to_sa_mux_lev_1_decoder = delay_array_to_mat +
    bank.mat.sa_mux_lev_1_predec->delay +
    bank.mat.sa_mux_lev_1_dec->delay;
  delay_array_to_sa_mux_lev_2_decoder = delay_array_to_mat +
    bank.mat.sa_mux_lev_2_predec->delay +
    bank.mat.sa_mux_lev_2_dec->delay;
  double delay_inside_mat = bank.mat.row_dec->delay + bank.mat.delay_bitline + bank.mat.delay_sa;
  // The slowest of the four parallel paths gates the subarray output driver.
  delay_before_subarray_output_driver =
    MAX(MAX(max_delay_before_row_decoder + delay_inside_mat,  // row_path
            delay_array_to_mat + bank.mat.b_mux_predec->delay + bank.mat.bit_mux_dec->delay + bank.mat.delay_sa),  // col_path
        MAX(delay_array_to_sa_mux_lev_1_decoder,    // sa_mux_lev_1_path
            delay_array_to_sa_mux_lev_2_decoder));  // sa_mux_lev_2_path
  // Reply path: subarray output driver, bank-level and array-level data-out H-trees.
  delay_from_subarray_out_drv_to_out = bank.mat.delay_subarray_out_drv_htree +
                                       bank.htree_out_data->delay + htree_out_data->delay;
  if (dp.fully_assoc)
  {
    // A fully-associative access contains both the CAM tag and the RAM data:
    // request H-trees, then bitline + match line, then the reply path.
    double cam_delay_inside_mat = bank.mat.delay_bitline + bank.mat.delay_matchchline;
    access_time  = htree_in_add->delay + bank.htree_in_add->delay;
    access_time += cam_delay_inside_mat + delay_from_subarray_out_drv_to_out;
  }
  else
  {
    access_time = delay_before_subarray_output_driver + delay_from_subarray_out_drv_to_out; //data_acc_path
  }
  if (dp.is_main_mem)
  {
    // Main memory overrides the access time with tRCD + CAS latency.
    double t_rcd       = max_delay_before_row_decoder + delay_inside_mat;
    double cas_latency = MAX(delay_array_to_sa_mux_lev_1_decoder, delay_array_to_sa_mux_lev_2_decoder) +
                         delay_from_subarray_out_drv_to_out;
    access_time = t_rcd + cas_latency;
  }
  // Random cycle time: the in-mat work plus the restore phases, bounded below
  // by each predecoder stage.
  double rand_cycle;
  if (!dp.fully_assoc)
  {
    rand_cycle = delay_inside_mat + bank.mat.delay_wl_reset + bank.mat.delay_bl_restore;//TODO: Sheng: revisit
    if (dp.is_dram)
    {
      rand_cycle += bank.mat.delay_writeback;
    }
    rand_cycle = MAX(rand_cycle, bank.mat.r_predec->delay);
    rand_cycle = MAX(rand_cycle, bank.mat.b_mux_predec->delay);
    rand_cycle = MAX(rand_cycle, bank.mat.sa_mux_lev_1_predec->delay);
    rand_cycle = MAX(rand_cycle, bank.mat.sa_mux_lev_2_predec->delay);
  }
  else
  {
    double cam_delay_inside_mat = bank.mat.delay_bitline + bank.mat.delay_matchchline;
    rand_cycle = cam_delay_inside_mat + bank.mat.delay_cam_sl_restore + bank.mat.delay_cam_ml_reset + bank.mat.delay_bl_restore
                 + bank.mat.delay_hit_miss_reset + bank.mat.delay_wl_reset;
    rand_cycle = MAX(rand_cycle, bank.mat.b_mux_predec->delay);//TODO: Sheng revisit whether distinguish cam and ram bitline etc.
    rand_cycle = MAX(rand_cycle, bank.mat.sa_mux_lev_1_predec->delay);
    rand_cycle = MAX(rand_cycle, bank.mat.sa_mux_lev_2_predec->delay);
  }
  // The following is true only if the input parameter "repeaters_in_htree" is set to false --Nav
  if (g_ip->rpters_in_htree == false)
  {
    rand_cycle = MAX(rand_cycle, bank.htree_in_add->max_unpipelined_link_delay);
  }
  cycle_time = rand_cycle;
  double delay_req_network = max_delay_before_row_decoder;
  double delay_rep_network = delay_from_subarray_out_drv_to_out;
  multisubbank_interleave_cycle_time = MAX(delay_req_network, delay_rep_network);
  if (dp.is_main_mem)
  {
    // Main memory: the cycle is a full access followed by a precharge.
    multisubbank_interleave_cycle_time = htree_in_add->delay;
    precharge_delay = htree_in_add->delay +
                      bank.htree_in_add->delay + bank.mat.delay_writeback +
                      bank.mat.delay_wl_reset + bank.mat.delay_bl_restore;
    cycle_time = access_time + precharge_delay;
  }
  else
  {
    precharge_delay = 0;
  }
  return outrisetime;
 }
 // note: currently, power numbers are for a bank of an array
 //
 // Fill in the power/energy members of this UCA.  The bank's own figures are
 // computed first and copied into `power`; the array-level H-tree ("routing
 // to bank") contributions are then added on top, and the per-command
 // energies (activate / read / write / precharge), the open- and closed-page
 // leakage figures and the DRAM refresh power are derived from the bank's
 // component powers.  The function ends by asserting that the resulting
 // read/write dynamic power and read leakage are strictly positive.
 void UCA::compute_power_energy()
 {
  bank.compute_power_energy();
  power = bank.power;
  // Array-level routing energy: a read uses the address-in and data-out
  // trees, a write uses the address-in and data-in trees.
  power_routing_to_bank.readOp.dynamic  = htree_in_add->power.readOp.dynamic + htree_out_data->power.readOp.dynamic;
  power_routing_to_bank.writeOp.dynamic = htree_in_add->power.readOp.dynamic + htree_in_data->power.readOp.dynamic;
  // Search trees are only dereferenced for fully-associative / pure-CAM arrays.
  if (dp.fully_assoc || dp.pure_cam)
      power_routing_to_bank.searchOp.dynamic= htree_in_search->power.searchOp.dynamic + htree_out_search->power.searchOp.dynamic;
  // NOTE(review): the leakage terms are accumulated with "+=", so they rely on
  // power_routing_to_bank starting out at zero -- presumably guaranteed by
  // powerDef's constructor; confirm.
  power_routing_to_bank.readOp.leakage += htree_in_add->power.readOp.leakage +
                                          htree_in_data->power.readOp.leakage +
                                          htree_out_data->power.readOp.leakage;
  power_routing_to_bank.readOp.gate_leakage += htree_in_add->power.readOp.gate_leakage +
                                          htree_in_data->power.readOp.gate_leakage +
                                          htree_out_data->power.readOp.gate_leakage;
  if (dp.fully_assoc || dp.pure_cam)
  {
        power_routing_to_bank.readOp.leakage += htree_in_search->power.readOp.leakage + htree_out_search->power.readOp.leakage;
        power_routing_to_bank.readOp.gate_leakage += htree_in_search->power.readOp.gate_leakage + htree_out_search->power.readOp.gate_leakage;
  }
  // Fold the routing contribution into the totals copied from the bank.
  power.searchOp.dynamic += power_routing_to_bank.searchOp.dynamic;
  power.readOp.dynamic += power_routing_to_bank.readOp.dynamic;
  power.readOp.leakage += power_routing_to_bank.readOp.leakage;
  power.readOp.gate_leakage += power_routing_to_bank.readOp.gate_leakage;
  // calculate total write energy per access
  // Derived from the read energy by swapping the read-specific terms
  // (bitline read, read routing, bank data-out tree) for their write
  // counterparts (bitline write, write routing, bank data-in tree).
  power.writeOp.dynamic = power.readOp.dynamic
                        - bank.mat.power_bitline.readOp.dynamic * dp.num_act_mats_hor_dir
                        + bank.mat.power_bitline.writeOp.dynamic * dp.num_act_mats_hor_dir
                        - power_routing_to_bank.readOp.dynamic
                        + power_routing_to_bank.writeOp.dynamic
                        + bank.htree_in_data->power.readOp.dynamic
                        - bank.htree_out_data->power.readOp.dynamic;
  // For non-DRAM arrays the sense-amp read energy is removed from a write.
  if (dp.is_dram == false)
  {
    power.writeOp.dynamic -= bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir;
  }
  // Open-page read: the closed-page read minus the row predecoder, row
  // decoder, bitline precharge/equalise driver, sense-amp and bitline energy.
  dyn_read_energy_from_closed_page = power.readOp.dynamic;
  dyn_read_energy_from_open_page   = power.readOp.dynamic -
                                     (bank.mat.r_predec->power.readOp.dynamic +
                                      bank.mat.power_row_decoders.readOp.dynamic +
                                      bank.mat.power_bl_precharge_eq_drv.readOp.dynamic +
                                      bank.mat.power_sa.readOp.dynamic +
                                      bank.mat.power_bitline.readOp.dynamic) * dp.num_act_mats_hor_dir;
  // Energy of the burst words after the first one: (burst_len /
  // int_prefetch_w, floored at 1) minus one, times the column-path energy.
  dyn_read_energy_remaining_words_in_burst =
    (MAX((g_ip->burst_len / g_ip->int_prefetch_w), 1) - 1) *
    ((bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic +
      bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic +
      bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic +
      bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic +
      bank.mat.power_subarray_out_drv.readOp.dynamic)     * dp.num_act_mats_hor_dir +
     bank.htree_out_data->power.readOp.dynamic +
     power_routing_to_bank.readOp.dynamic);
  dyn_read_energy_from_closed_page += dyn_read_energy_remaining_words_in_burst;
  dyn_read_energy_from_open_page   += dyn_read_energy_remaining_words_in_burst;
  // Per-command energies.
  // Activate: address routing plus row predecode/decode and sense amps.
  activate_energy = htree_in_add->power.readOp.dynamic +
                    bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_act +
                    (bank.mat.r_predec->power.readOp.dynamic +
                     bank.mat.power_row_decoders.readOp.dynamic +
                     bank.mat.power_sa.readOp.dynamic) * dp.num_act_mats_hor_dir;
  // Read: address routing, sense-amp mux selection, output drivers and data
  // trees, scaled by the burst length.
  read_energy    = (htree_in_add->power.readOp.dynamic +
                    bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_rd_or_wr +
                    (bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic  +
                     bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic  +
                     bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic +
                     bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic +
                     bank.mat.power_subarray_out_drv.readOp.dynamic) * dp.num_act_mats_hor_dir +
                    bank.htree_out_data->power.readOp.dynamic +
                    htree_in_data->power.readOp.dynamic) * g_ip->burst_len;
  // Write: address routing, data-in trees and sense-amp mux selection,
  // scaled by the burst length.
  write_energy   = (htree_in_add->power.readOp.dynamic +
                    bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_rd_or_wr +
                    htree_in_data->power.readOp.dynamic +
                    bank.htree_in_data->power.readOp.dynamic +
                    (bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic  +
                     bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic  +
                     bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic +
                     bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic) * dp.num_act_mats_hor_dir) * g_ip->burst_len;
  // Precharge: bitline plus precharge/equalise driver energy.
  precharge_energy = (bank.mat.power_bitline.readOp.dynamic +
                      bank.mat.power_bl_precharge_eq_drv.readOp.dynamic) * dp.num_act_mats_hor_dir;
  // Closed-page leakage of the active mats: the first sum is the readOp.leakage
  // of the decoders plus the sense-amp closed-page term, the second sum adds
  // their gate leakage (the sense-amp term is counted only once).
  leak_power_subbank_closed_page =
    (bank.mat.r_predec->power.readOp.leakage +
     bank.mat.b_mux_predec->power.readOp.leakage +
     bank.mat.sa_mux_lev_1_predec->power.readOp.leakage +
     bank.mat.sa_mux_lev_2_predec->power.readOp.leakage +
     bank.mat.power_row_decoders.readOp.leakage +
     bank.mat.power_bit_mux_decoders.readOp.leakage +
     bank.mat.power_sa_mux_lev_1_decoders.readOp.leakage +
     bank.mat.power_sa_mux_lev_2_decoders.readOp.leakage +
     bank.mat.leak_power_sense_amps_closed_page_state) * dp.num_act_mats_hor_dir;
  leak_power_subbank_closed_page +=
    (bank.mat.r_predec->power.readOp.gate_leakage +
     bank.mat.b_mux_predec->power.readOp.gate_leakage +
     bank.mat.sa_mux_lev_1_predec->power.readOp.gate_leakage +
     bank.mat.sa_mux_lev_2_predec->power.readOp.gate_leakage +
     bank.mat.power_row_decoders.readOp.gate_leakage +
     bank.mat.power_bit_mux_decoders.readOp.gate_leakage +
     bank.mat.power_sa_mux_lev_1_decoders.readOp.gate_leakage +
     bank.mat.power_sa_mux_lev_2_decoders.readOp.gate_leakage) * dp.num_act_mats_hor_dir; //+
     //bank.mat.leak_power_sense_amps_closed_page_state) * dp.num_act_mats_hor_dir;
  // Open-page leakage: same structure, using the sense-amp open-page term.
  leak_power_subbank_open_page =
    (bank.mat.r_predec->power.readOp.leakage +
     bank.mat.b_mux_predec->power.readOp.leakage +
     bank.mat.sa_mux_lev_1_predec->power.readOp.leakage +
     bank.mat.sa_mux_lev_2_predec->power.readOp.leakage +
     bank.mat.power_row_decoders.readOp.leakage +
     bank.mat.power_bit_mux_decoders.readOp.leakage +
     bank.mat.power_sa_mux_lev_1_decoders.readOp.leakage +
     bank.mat.power_sa_mux_lev_2_decoders.readOp.leakage +
     bank.mat.leak_power_sense_amps_open_page_state) * dp.num_act_mats_hor_dir;
  leak_power_subbank_open_page +=
    (bank.mat.r_predec->power.readOp.gate_leakage +
     bank.mat.b_mux_predec->power.readOp.gate_leakage +
     bank.mat.sa_mux_lev_1_predec->power.readOp.gate_leakage +
     bank.mat.sa_mux_lev_2_predec->power.readOp.gate_leakage +
     bank.mat.power_row_decoders.readOp.gate_leakage +
     bank.mat.power_bit_mux_decoders.readOp.gate_leakage +
     bank.mat.power_sa_mux_lev_1_decoders.readOp.gate_leakage +
     bank.mat.power_sa_mux_lev_2_decoders.readOp.gate_leakage ) * dp.num_act_mats_hor_dir;
     //bank.mat.leak_power_sense_amps_open_page_state) * dp.num_act_mats_hor_dir;
  // Leakage of the request/reply networks: array-level routing plus the
  // bank-level address, data-in and data-out trees (readOp.leakage, then
  // gate leakage).
  leak_power_request_and_reply_networks =
    power_routing_to_bank.readOp.leakage +
    bank.htree_in_add->power.readOp.leakage +
    bank.htree_in_data->power.readOp.leakage +
    bank.htree_out_data->power.readOp.leakage;
  leak_power_request_and_reply_networks +=
    power_routing_to_bank.readOp.gate_leakage +
    bank.htree_in_add->power.readOp.gate_leakage +
    bank.htree_in_data->power.readOp.gate_leakage +
    bank.htree_out_data->power.readOp.gate_leakage;
  // NOTE(review): power_routing_to_bank.readOp.leakage / gate_leakage already
  // had the search-tree leakage added above for this same condition, so these
  // terms appear to be counted a second time here -- confirm intent.
  if (dp.fully_assoc || dp.pure_cam)
  {
        leak_power_request_and_reply_networks += htree_in_search->power.readOp.leakage + htree_out_search->power.readOp.leakage;
        leak_power_request_and_reply_networks += htree_in_search->power.readOp.gate_leakage + htree_out_search->power.readOp.gate_leakage;
  }
  if (dp.is_dram)
  { // if DRAM, add contribution of power spent in row predecoder drivers, blocks and decoders to refresh power
    refresh_power  = (bank.mat.r_predec->power.readOp.dynamic * dp.num_act_mats_hor_dir +
                      bank.mat.row_dec->power.readOp.dynamic) * dp.num_r_subarray * dp.num_subarrays;
    refresh_power += bank.mat.per_bitline_read_energy * dp.num_c_subarray * dp.num_r_subarray * dp.num_subarrays;
    refresh_power += bank.mat.power_bl_precharge_eq_drv.readOp.dynamic * dp.num_act_mats_hor_dir;
    refresh_power += bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir;
    // The accumulated energy is divided by the refresh period.
    refresh_power /= dp.dram_refresh_period;
  }
  // For non-tag arrays the read/write dynamic figures computed earlier are
  // replaced by burst-aware values based on the closed-page read energy.
  if (dp.is_tag == false)
  {
    power.readOp.dynamic  = dyn_read_energy_from_closed_page;
    power.writeOp.dynamic = dyn_read_energy_from_closed_page
      - dyn_read_energy_remaining_words_in_burst
      - bank.mat.power_bitline.readOp.dynamic * dp.num_act_mats_hor_dir
      + bank.mat.power_bitline.writeOp.dynamic * dp.num_act_mats_hor_dir
      + (power_routing_to_bank.writeOp.dynamic -
         power_routing_to_bank.readOp.dynamic -
         bank.htree_out_data->power.readOp.dynamic +
         bank.htree_in_data->power.readOp.dynamic) *
        (MAX((g_ip->burst_len / g_ip->int_prefetch_w), 1) - 1); //FIXME
    if (dp.is_dram == false)
    {
      power.writeOp.dynamic -= bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir;
    }
  }
  // if DRAM, add refresh power to total leakage
  if (dp.is_dram)
  {
    power.readOp.leakage += refresh_power;
  }
  // TODO: below should be  avoided.
  /*if (dp.is_main_mem)
  {
    power.readOp.leakage += MAIN_MEM_PER_CHIP_STANDBY_CURRENT_mA * 1e-3 * g_tp.peri_global.Vdd / g_ip->nbanks;
  }*/
  // Sanity checks: the model must produce strictly positive figures.
  assert(power.readOp.dynamic  > 0);
  assert(power.writeOp.dynamic > 0);
  assert(power.readOp.leakage  > 0);
 }
--- a/ext/mcpat/cacti/uca.h
+++ b/ext/mcpat/cacti/uca.h
@ -0,0 +1,95 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __UCA_H__
 #define __UCA_H__
 #include "area.h"
 #include "bank.h"
 #include "component.h"
 #include "htree2.h"
 #include "parameter.h"
 // Array-level model: one Bank plus the array-level H-trees that route
 // address, data and search bits to it.  The constructor builds everything and
 // runs compute_delays() and compute_power_energy(), so the result members
 // below are valid as soon as construction finishes.
 class UCA : public Component
 {
  public:
    UCA(const DynamicParameter & dyn_p);
    ~UCA();
    double compute_delays(double inrisetime);  // returns outrisetime
    void   compute_power_energy();
    // Private copy of the parameters passed to the constructor; declared
    // before `bank` because the constructor builds `bank` from it.
    DynamicParameter dp;
    Bank   bank;
    // Array-level H-trees, allocated with new in the constructor.
    Htree2   * htree_in_add;
    Htree2   * htree_in_data;
    Htree2   * htree_out_data;
    // Search H-trees: allocated only when dp.fully_assoc || dp.pure_cam.
    Htree2   * htree_in_search;
    Htree2   * htree_out_search;
    // Power of the array-level H-trees, filled in by compute_power_energy().
    powerDef power_routing_to_bank;
    uint32_t nbanks;  // copied from g_ip->nbanks
    // Bits routed to a bank, summed over the ports that carry them:
    // address, data-in, data-out, search-in, search-out.
    int   num_addr_b_bank;
    int   num_di_b_bank;
    int   num_do_b_bank;
    int   num_si_b_bank;
    int   num_so_b_bank;
    // Port counts: read/write, read-only, write-only and search ports.
    int   RWP, ERP, EWP,SCHP;
    double area_all_dataramcells;
    // Results of compute_power_energy().
    double dyn_read_energy_from_closed_page;
    double dyn_read_energy_from_open_page;
    double dyn_read_energy_remaining_words_in_burst;
    double refresh_power;  // only for DRAM
    double activate_energy;
    double read_energy;
    double write_energy;
    double precharge_energy;
    double leak_power_subbank_closed_page;
    double leak_power_subbank_open_page;
    double leak_power_request_and_reply_networks;
    // Results of compute_delays().
    double delay_array_to_sa_mux_lev_1_decoder;
    double delay_array_to_sa_mux_lev_2_decoder;
    double delay_before_subarray_output_driver;
    double delay_from_subarray_out_drv_to_out;
    double access_time;
    double precharge_delay;  // set to 0 unless dp.is_main_mem
    double multisubbank_interleave_cycle_time;
 };
 #endif
--- a/ext/mcpat/cacti/wire.cc
+++ b/ext/mcpat/cacti/wire.cc
@ -0,0 +1,832 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include "wire.h"
 #include "cmath"
 // Per-wire constructor: models one wire of type wire_model and derives its
 // statistics through calculate_wire_stats().
 //
 // wl is scaled by 1e-6 on the way in; after the statistics are computed the
 // length, width, spacing and repeater spacing are scaled back by 1e6.  If the
 // class-wide defaults have not been set up yet (initialized != 1), a
 // default-constructed Wire is created first to do so.
 Wire::Wire(
    enum Wire_type wire_model,
    double wl,
    int n,
    double w_s,
    double s_s,
    enum Wire_placement wp,
    double resistivity,
    TechnologyParameter::DeviceType *dt
    )
    : wt(wire_model),
      wire_length(wl*1e-6),
      nsense(n),
      w_scale(w_s),
      s_scale(s_s),
      resistivity(resistivity),
      deviceType(dt)
 {
  in_rise_time   = 0;
  out_rise_time  = 0;
  wire_placement = wp;
  min_w_pmos     = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_;
  if (initialized != 1)
  {
    cout << "Wire not initialized. Initializing it with default values\n";
    // Constructing (and immediately discarding) a default Wire is what
    // performs the class-wide initialisation.
    Wire default_wire;
  }
  calculate_wire_stats();
  // change everything back to seconds, microns, and Joules
  wire_length      *= 1e6;
  wire_width       *= 1e6;
  wire_spacing     *= 1e6;
  repeater_spacing *= 1e6;
  // The resulting model must describe a real wire with non-zero power.
  assert(wire_length > 0);
  assert(power.readOp.dynamic > 0);
  assert(power.readOp.leakage > 0);
  assert(power.readOp.gate_leakage > 0);
 }
    // the following values are for peripheral global technology
    // specified in the input config file
    //
    // Definitions of Wire's static data members.  calculate_wire_stats() reads
    // the Component entries as per-unit-length references: delay and power are
    // multiplied by the wire length, while area.w and area.h carry the
    // repeater spacing and repeater size respectively.
    Component Wire::global;
    Component Wire::global_5;
    Component Wire::global_10;
    Component Wire::global_20;
    Component Wire::global_30;
    Component Wire::low_swing;
    // Set to 1 by the five-argument constructor; the per-wire constructor
    // checks it to decide whether default initialisation is still needed.
    int Wire::initialized;
    // Wire width / spacing recorded by the five-argument constructor after
    // init_wire() has run.
    double Wire::wire_width_init;
    double Wire::wire_spacing_init;
 // Class-initialising constructor: stores the scaling, placement, resistivity
 // and device parameters, derives the wire width and spacing from the pitch of
 // the selected wire class, marks the class as initialised and runs
 // init_wire().  The resulting width and spacing are recorded in the static
 // wire_width_init / wire_spacing_init members.
 Wire::Wire(double w_s, double s_s, enum Wire_placement wp, double resis, TechnologyParameter::DeviceType *dt)
 {
  deviceType     = dt;
  resistivity    = resis;
  wire_placement = wp;
  w_scale        = w_s;
  s_scale        = s_s;
  in_rise_time   = 0;
  out_rise_time  = 0;
  min_w_pmos     = deviceType->n_to_p_eff_curr_drv_ratio * g_tp.min_w_nmos_;
  // Start from the pitch of the wire class selected by the placement; any
  // placement other than outside_mat / inside_mat uses the local wire pitch.
  if (wire_placement == outside_mat)
  {
    wire_width = g_tp.wire_outside_mat.pitch;
  }
  else if (wire_placement == inside_mat)
  {
    wire_width = g_tp.wire_inside_mat.pitch;
  }
  else
  {
    wire_width = g_tp.wire_local.pitch;
  }
  // Width and spacing each start from that pitch, scaled by the
  // corresponding factor.
  wire_spacing = wire_width;
  wire_width   *= (w_scale * 1e-6/2) /* (m) */;
  wire_spacing *= (s_scale * 1e-6/2) /* (m) */;
  // The flag is raised before init_wire() runs, and the width/spacing are
  // captured only after it returns; keep this order.
  initialized = 1;
  init_wire();
  wire_width_init   = wire_width;
  wire_spacing_init = wire_spacing;
  assert(power.readOp.dynamic > 0);
  assert(power.readOp.leakage > 0);
  assert(power.readOp.gate_leakage > 0);
 }
 // Nothing to release: the destructor body is intentionally empty.
 Wire::~Wire() {}
 void
 Wire::calculate_wire_stats()
 {
  if (wire_placement == outside_mat) {
    wire_width = g_tp.wire_outside_mat.pitch;
  }
  else if (wire_placement == inside_mat) {
    wire_width = g_tp.wire_inside_mat.pitch;
  }
  else {
    wire_width = g_tp.wire_local.pitch;
  }
  wire_spacing = wire_width;
  wire_width   *= (w_scale * 1e-6/2) /* (m) */;
  wire_spacing *= (s_scale * 1e-6/2) /* (m) */;
  if (wt != Low_swing) {
          //    delay_optimal_wire();
          if (wt == Global) {
                  delay = global.delay * wire_length;
                  power.readOp.dynamic = global.power.readOp.dynamic * wire_length;
                  power.readOp.leakage = global.power.readOp.leakage * wire_length;
                  power.readOp.gate_leakage = global.power.readOp.gate_leakage * wire_length;
                  repeater_spacing = global.area.w;
                  repeater_size = global.area.h;
                  area.set_area((wire_length/repeater_spacing) *
                                  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
                                                  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
          }
          else if (wt == Global_5) {
                  delay = global_5.delay * wire_length;
                  power.readOp.dynamic = global_5.power.readOp.dynamic * wire_length;
                  power.readOp.leakage = global_5.power.readOp.leakage * wire_length;
                  power.readOp.gate_leakage = global_5.power.readOp.gate_leakage * wire_length;
                  repeater_spacing = global_5.area.w;
                  repeater_size = global_5.area.h;
                  area.set_area((wire_length/repeater_spacing) *
                                  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
                                                  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
          }
          else if (wt == Global_10) {
                  delay = global_10.delay * wire_length;
                  power.readOp.dynamic = global_10.power.readOp.dynamic * wire_length;
                  power.readOp.leakage = global_10.power.readOp.leakage * wire_length;
                  power.readOp.gate_leakage = global_10.power.readOp.gate_leakage * wire_length;
                  repeater_spacing = global_10.area.w;
                  repeater_size = global_10.area.h;
                  area.set_area((wire_length/repeater_spacing) *
                                  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
                                                  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
          }
          else if (wt == Global_20) {
                  delay = global_20.delay * wire_length;
                  power.readOp.dynamic = global_20.power.readOp.dynamic * wire_length;
                  power.readOp.leakage = global_20.power.readOp.leakage * wire_length;
                  power.readOp.gate_leakage = global_20.power.readOp.gate_leakage * wire_length;
                  repeater_spacing = global_20.area.w;
                  repeater_size = global_20.area.h;
                  area.set_area((wire_length/repeater_spacing) *
                                  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
                                                  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
          }
          else if (wt == Global_30) {
                  delay = global_30.delay * wire_length;
                  power.readOp.dynamic = global_30.power.readOp.dynamic * wire_length;
                  power.readOp.leakage = global_30.power.readOp.leakage * wire_length;
                  power.readOp.gate_leakage = global_30.power.readOp.gate_leakage * wire_length;
                  repeater_spacing = global_30.area.w;
                  repeater_size = global_30.area.h;
                  area.set_area((wire_length/repeater_spacing) *
                                  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
                                                  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
          }
    out_rise_time = delay*repeater_spacing/deviceType->Vth;
  }
  else if (wt == Low_swing) {
    low_swing_model ();
    repeater_spacing = wire_length;
    repeater_size = 1;
  }
  else {
    assert(0);
  }
 }
 /*
  * Fall time of the signal feeding the first stage of a circuit.  It is
  * taken to be the output fall time of two minimum-sized inverters in
  * series (refer: CACTI 1 Technical report, section 6.1.3).
  */
 double
 Wire::signal_fall_time ()
 {
   const double vth_ratio = deviceType->Vth/deviceType->Vdd;

   // Load seen by either inverter: its own drains plus one inverter's gate.
   const double load_cap = (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
       drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
       gate_C(min_w_pmos + g_tp.min_w_nmos_, 0));

   /* rise time of inverter 1's output (pull-up through the pmos) */
   const double tc_rise = load_cap * tr_R_on(min_w_pmos, PCH, 1);
   const double rise = horowitz (0, tc_rise, vth_ratio, vth_ratio, FALL) /
     (deviceType->Vdd - deviceType->Vth);

   /* fall time of inverter 2's output (pull-down through the nmos) */
   const double tc_fall = load_cap * tr_R_on(g_tp.min_w_nmos_, NCH, 1);
   const double fall = horowitz (rise, tc_fall, vth_ratio, vth_ratio, RISE) /
     deviceType->Vth;

   return fall;
 }
 /*
  * Rise-time counterpart of signal_fall_time(): the output transition of
  * two minimum-sized inverters in series, with the first inverter pulling
  * down and the second one pulling up.
  */
 double Wire::signal_rise_time ()
 {
   const double vth_ratio = deviceType->Vth/deviceType->Vdd;

   // Load seen by either inverter: its own drains plus one inverter's gate.
   const double load_cap = (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
       drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
       gate_C(min_w_pmos + g_tp.min_w_nmos_, 0));

   /* inverter 1: discharge through the nmos */
   const double tc_first = load_cap * tr_R_on(g_tp.min_w_nmos_, NCH, 1);
   const double first_edge = horowitz (0, tc_first, vth_ratio, vth_ratio, RISE) /
     deviceType->Vth;

   /* inverter 2: charge through the pmos, driven by inverter 1's edge */
   const double tc_second = load_cap * tr_R_on(min_w_pmos, PCH, 1);
   const double second_edge = horowitz (first_edge, tc_second, vth_ratio, vth_ratio, FALL) /
     (deviceType->Vdd - deviceType->Vth);

   return second_edge; //sec
 }
 /* Wire resistance and capacitance calculations
 *   wire width
 *
 *    /__/
 *   |  |
 *   |  |  height = ASPECT_RATIO*wire width (ASPECT_RATIO = 2.2, ref: ITRS)
 *   |__|/
 *
 *   spacing between wires in same level = wire width
 *   spacing between wires in adjacent levels = wire width---this is incorrect,
 *   according to R.Ho's paper and thesis. ILD != wire width
 *
 */
 double Wire::wire_cap (double len /* in m */, bool call_from_outside)
 {
        //TODO: this should be consistent with the wire_res in technology file
  double sidewall, adj, tot_cap;
  double wire_height;
  double epsilon0 = 8.8542e-12;
  double aspect_ratio, horiz_dielectric_constant, vert_dielectric_constant, miller_value,ild_thickness;
  switch (wire_placement)
  {
    case outside_mat:
        {
                aspect_ratio = g_tp.wire_outside_mat.aspect_ratio;
                horiz_dielectric_constant = g_tp.wire_outside_mat.horiz_dielectric_constant;
                vert_dielectric_constant = g_tp.wire_outside_mat.vert_dielectric_constant;
                miller_value = g_tp.wire_outside_mat.miller_value;
                ild_thickness = g_tp.wire_outside_mat.ild_thickness;
                break;
        }
    case inside_mat :
        {
                aspect_ratio = g_tp.wire_inside_mat.aspect_ratio;
                horiz_dielectric_constant = g_tp.wire_inside_mat.horiz_dielectric_constant;
                vert_dielectric_constant = g_tp.wire_inside_mat.vert_dielectric_constant;
                miller_value = g_tp.wire_inside_mat.miller_value;
                ild_thickness = g_tp.wire_inside_mat.ild_thickness;
                break;
        }
    default:
        {
                aspect_ratio = g_tp.wire_local.aspect_ratio;
                horiz_dielectric_constant = g_tp.wire_local.horiz_dielectric_constant;
                vert_dielectric_constant = g_tp.wire_local.vert_dielectric_constant;
                miller_value = g_tp.wire_local.miller_value;
                ild_thickness = g_tp.wire_local.ild_thickness;
                break;
        }
  }
  if (call_from_outside)
  {
          wire_width       *= 1e-6;
          wire_spacing     *= 1e-6;
  }
  wire_height = wire_width/w_scale*aspect_ratio;
  /*
   * assuming height does not change. wire_width = width_original*w_scale
   * So wire_height does not change as wire width increases
   */
 // capacitance between wires in the same level
 //  sidewall = 2*miller_value * horiz_dielectric_constant * (wire_height/wire_spacing)
 //    * epsilon0;
  sidewall = miller_value * horiz_dielectric_constant * (wire_height/wire_spacing)
    * epsilon0;
  // capacitance between wires in adjacent levels
  //adj = miller_value * vert_dielectric_constant *w_scale * epsilon0;
  //adj = 2*vert_dielectric_constant *wire_width/(ild_thickness*1e-6) * epsilon0;
  adj = miller_value *vert_dielectric_constant *wire_width/(ild_thickness*1e-6) * epsilon0;
  //Change ild_thickness from micron to M
  //tot_cap =  (sidewall + adj + (deviceType->C_fringe * 1e6)); //F/m
  tot_cap =  (sidewall + adj + (g_tp.fringe_cap * 1e6)); //F/m
  if (call_from_outside)
  {
          wire_width       *= 1e6;
          wire_spacing     *= 1e6;
  }
  return (tot_cap*len); // (F)
 }
  /*
   * Resistance (ohm) of a wire of length len (m), using the aspect ratio of
   * the layer selected by wire_placement.
   */
 double
 Wire::wire_res (double len /*(in m)*/)
 {
   //TODO: this should be consistent with the wire_res in technology file
   //The whole computation should be consistent with the wire_res in technology.cc too!
   const double alpha_scatter = 1.05;
   const double dishing_thickness = 0;
   const double barrier_thickness = 0;

   double aspect_ratio;
   if (wire_placement == outside_mat) {
     aspect_ratio = g_tp.wire_outside_mat.aspect_ratio;
   }
   else if (wire_placement == inside_mat) {
     aspect_ratio = g_tp.wire_inside_mat.aspect_ratio;
   }
   else {
     aspect_ratio = g_tp.wire_local.aspect_ratio;
   }

   // Conducting cross-section: (height - losses) x (width - barriers).
   const double eff_height = aspect_ratio*wire_width/w_scale-dishing_thickness - barrier_thickness;
   const double eff_width  = wire_width-2*barrier_thickness;

   return (alpha_scatter * resistivity * 1e-6 * len/(eff_height*
                   eff_width));
 }
 /*
 * Calculates the delay and power of a low-swing differential wire:
 * transmitter (nand gate + inverter + nmos driver), the wire itself and
 * the receiving sense amplifier.  Results are stored in the transmitter,
 * l_wire and sense_amp components and summed into delay / power;
 * out_rise_time is set from the sense-amp delay.
 *
 * The transmitter delay is the sum of nand gate delay, inverter delay
 * low swing nmos delay, and the wire delay
 * (ref: Technical report 6)
 */
 void
 Wire::low_swing_model()
 {
   double len = wire_length;
   double beta = pmos_to_nmos_sz_ratio();
   double inputrise = (in_rise_time == 0) ? signal_rise_time() : in_rise_time;
   /* Final nmos low swing driver size calculation:
    * Try to size the driver such that the delay
    * is less than 8FO4.
    * If the driver size is greater than
    * the max allowable size, assume max size for the driver.
    * In either case, recalculate the delay using
    * the final driver size assuming slow input with
    * finite rise time instead of ideal step input
    *
    * (ref: Technical report 6)
    */
   double cwire = wire_cap(len); /* load capacitance */
   double rwire = wire_res(len);
 #define RES_ADJ (8.6) // Increase in resistance due to low driving vol.
   double driver_res = (-8*g_tp.FO4/(log(0.5) * cwire))/RES_ADJ;
   double nsize = R_to_w(driver_res, NCH);
   nsize = MIN(nsize, g_tp.max_w_nmos_);
   nsize = MAX(nsize, g_tp.min_w_nmos_);
   if(rwire*cwire > 8*g_tp.FO4)
   {
     nsize = g_tp.max_w_nmos_;
   }
   // size the inverter appropriately to minimize the transmitter delay
   // Note - In order to minimize leakage, we are not adding a set of inverters to
   // bring down delay. Instead, we are sizing the single gate
   // based on the logical effort.
   //
   // Logical effort of a 2-input nand with pmos/nmos ratio beta is
   // (2+beta)/(1+beta).  The parentheses are required: without them the
   // expression parses as 2 + beta + beta.
   const double nand_effort = (2+beta)/(1+beta);
   double st_eff   = sqrt(nand_effort*gate_C(nsize, 0)/(gate_C(2*g_tp.min_w_nmos_, 0)
         + gate_C(2*min_w_pmos, 0)));
   double req_cin  = (nand_effort*gate_C(nsize, 0))/st_eff;
   double inv_size = req_cin/(gate_C(min_w_pmos, 0) + gate_C(g_tp.min_w_nmos_, 0));
   inv_size = MAX(inv_size, 1);
   /* nand gate delay */
   double res_eq = (2 * tr_R_on(g_tp.min_w_nmos_, NCH, 1));
   double cap_eq = 2 * drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
     drain_C_(2*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
     gate_C(inv_size*g_tp.min_w_nmos_, 0) +
     gate_C(inv_size*min_w_pmos, 0);
   double timeconst = res_eq * cap_eq;
   delay = horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd,
       deviceType->Vth/deviceType->Vdd, RISE);
   double temp_power = cap_eq*deviceType->Vdd*deviceType->Vdd;
   inputrise = delay / (deviceType->Vdd - deviceType->Vth); /* for the next stage */
   /* Inverter delay:
    * The load capacitance of this inv depends on
    * the gate capacitance of the final stage nmos
    * transistor which in turn depends on nsize
    */
   res_eq = tr_R_on(inv_size*min_w_pmos, PCH, 1);
   cap_eq = drain_C_(inv_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
     drain_C_(inv_size*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
     gate_C(nsize, 0);
   timeconst = res_eq * cap_eq;
   delay += horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd,
       deviceType->Vth/deviceType->Vdd, FALL);
   temp_power += cap_eq*deviceType->Vdd*deviceType->Vdd;
   transmitter.delay = delay;
   transmitter.power.readOp.dynamic = temp_power*2; /* since it is a diff. model*/
   transmitter.power.readOp.leakage = deviceType->Vdd *
     (4 * cmos_Isub_leakage(g_tp.min_w_nmos_, min_w_pmos, 2, nand) +
      4 * cmos_Isub_leakage(g_tp.min_w_nmos_, min_w_pmos, 1, inv));
   transmitter.power.readOp.gate_leakage = deviceType->Vdd *
     (4 * cmos_Ig_leakage(g_tp.min_w_nmos_, min_w_pmos, 2, nand) +
      4 * cmos_Ig_leakage(g_tp.min_w_nmos_, min_w_pmos, 1, inv));
   inputrise = delay / deviceType->Vth;
   /* nmos delay + wire delay */
   cap_eq = cwire + drain_C_(nsize, NCH, 1, 1, g_tp.cell_h_def)*2 +
     nsense * sense_amp_input_cap(); //+receiver cap
   /*
    * NOTE: nmos is used as both pull up and pull down transistor
    * in the transmitter. This is because for low voltage swing, drive
    * resistance of nmos is less than pmos
    * (for a detailed graph ref: On-Chip Wires: Scaling and Efficiency)
    */
   timeconst = (tr_R_on(nsize, NCH, 1)*RES_ADJ) * (cwire +
       drain_C_(nsize, NCH, 1, 1, g_tp.cell_h_def)*2) +
     rwire*cwire/2 +
     (tr_R_on(nsize, NCH, 1)*RES_ADJ + rwire) *
     nsense * sense_amp_input_cap();
   /*
    * since we are pre-equalizing and overdriving the low
    * swing wires, the net time constant is less
    * than the actual value
    */
   delay += horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd, .25, 0);
 #define VOL_SWING .1
   temp_power += cap_eq*VOL_SWING*.400; /* .4v is the over drive voltage */
   temp_power *= 2; /* differential wire */
   // Wire-only share = running total minus what the transmitter accounts for.
   l_wire.delay = delay - transmitter.delay;
   l_wire.power.readOp.dynamic = temp_power - transmitter.power.readOp.dynamic;
   l_wire.power.readOp.leakage = deviceType->Vdd*
     (4* cmos_Isub_leakage(nsize, 0, 1, nmos));
   l_wire.power.readOp.gate_leakage = deviceType->Vdd*
     (4* cmos_Ig_leakage(nsize, 0, 1, nmos));
   /* receiver: sense amplifier */
   delay += g_tp.sense_delay;
   sense_amp.delay = g_tp.sense_delay;
   out_rise_time = g_tp.sense_delay/(deviceType->Vth);
   sense_amp.power.readOp.dynamic = g_tp.sense_dy_power;
   sense_amp.power.readOp.leakage = 0; //FIXME
   sense_amp.power.readOp.gate_leakage = 0;
   /* totals */
   power.readOp.dynamic = temp_power + sense_amp.power.readOp.dynamic;
   power.readOp.leakage = transmitter.power.readOp.leakage +
                          l_wire.power.readOp.leakage +
                          sense_amp.power.readOp.leakage;
   power.readOp.gate_leakage = transmitter.power.readOp.gate_leakage +
                          l_wire.power.readOp.gate_leakage +
                          sense_amp.power.readOp.gate_leakage;
 }
  // Capacitance presented by one sense amplifier input: isolation-pmos drain,
  // gates of the enable and sense nmos, and the sense nmos/pmos drains.
 double
 Wire::sense_amp_input_cap()
 {
   const double iso_drain   = drain_C_(g_tp.w_iso, PCH, 1, 1, g_tp.cell_h_def);
   const double sense_gates = gate_C(g_tp.w_sense_en + g_tp.w_sense_n, 0);
   const double nmos_drain  = drain_C_(g_tp.w_sense_n, NCH, 1, 1, g_tp.cell_h_def);
   const double pmos_drain  = drain_C_(g_tp.w_sense_p, PCH, 1, 1, g_tp.cell_h_def);
   return iso_drain + sense_gates + nmos_drain + pmos_drain;
 }
 void Wire::delay_optimal_wire ()
 {
  double len       = wire_length;
  //double min_wire_width = wire_width; //m
  double beta = pmos_to_nmos_sz_ratio();
  double switching = 0;  // switching energy
  double short_ckt = 0;  // short-circuit energy
  double tc        = 0;  // time constant
  // input cap of min sized driver
  double input_cap = gate_C(g_tp.min_w_nmos_ + min_w_pmos, 0);
   // output parasitic capacitance of
   // the min. sized driver
  double out_cap = drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
    drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def);
  // drive resistance
  double out_res = (tr_R_on(g_tp.min_w_nmos_, NCH, 1) +
      tr_R_on(min_w_pmos, PCH, 1))/2;
  double wr = wire_res(len); //ohm
  // wire cap /m
  double wc = wire_cap(len);
  // size the repeater such that the delay of the wire is minimum
  double repeater_scaling = sqrt(out_res*wc/(wr*input_cap)); // len will cancel
   // calc the optimum spacing between the repeaters (m)
  repeater_spacing = sqrt(2 * out_res * (out_cap + input_cap)/
      ((wr/len)*(wc/len)));
  repeater_size = repeater_scaling;
  switching = (repeater_scaling * (input_cap + out_cap) +
      repeater_spacing * (wc/len)) * deviceType->Vdd * deviceType->Vdd;
  tc = out_res * (input_cap + out_cap) +
    out_res * wc/len * repeater_spacing/repeater_scaling +
    wr/len * repeater_spacing * input_cap * repeater_scaling +
    0.5 * (wr/len) * (wc/len)* repeater_spacing * repeater_spacing;
  delay = 0.693 * tc * len/repeater_spacing;
 #define Ishort_ckt 65e-6 /* across all tech Ref:Banerjee et al. {IEEE TED} */
  short_ckt = deviceType->Vdd * g_tp.min_w_nmos_ * Ishort_ckt * 1.0986 *
    repeater_scaling * tc;
  area.set_area((len/repeater_spacing) *
                compute_gate_area(INV, 1, min_w_pmos * repeater_scaling,
                                          g_tp.min_w_nmos_ * repeater_scaling, g_tp.cell_h_def));
  power.readOp.dynamic = ((len/repeater_spacing)*(switching + short_ckt));
  power.readOp.leakage = ((len/repeater_spacing)*
      deviceType->Vdd*
      cmos_Isub_leakage(g_tp.min_w_nmos_*repeater_scaling, beta*g_tp.min_w_nmos_*repeater_scaling, 1, inv));
  power.readOp.gate_leakage = ((len/repeater_spacing)*
      deviceType->Vdd*
      cmos_Ig_leakage(g_tp.min_w_nmos_*repeater_scaling, beta*g_tp.min_w_nmos_*repeater_scaling, 1, inv));
 }
 // calculate power/delay values for wires with suboptimal repeater sizing/spacing
 void
 Wire::init_wire(){
  wire_length = 1;
  delay_optimal_wire();
    double sp, si;
  powerDef pow;
  si = repeater_size;
  sp = repeater_spacing;
  sp *= 1e6; // in microns
  double i, j, del;
  repeated_wire.push_back(Component());
  for (j=sp; j < 4*sp; j+=100) {
    for (i = si; i > 1; i--) {
      pow = wire_model(j*1e-6, i, &del);
      if (j == sp && i == si) {
        global.delay = del;
        global.power = pow;
        global.area.h = si;
        global.area.w = sp*1e-6; // m
      }
 //      cout << "Repeater size - "<< i <<
 //        " Repeater spacing - " << j <<
 //        " Delay - " << del <<
 //        " PowerD - " << pow.readOp.dynamic <<
 //        " PowerL - " << pow.readOp.leakage <<endl;
      repeated_wire.back().delay = del;
      repeated_wire.back().power.readOp = pow.readOp;
      repeated_wire.back().area.w = j*1e-6; //m
      repeated_wire.back().area.h = i;
      repeated_wire.push_back(Component());
    }
  }
  repeated_wire.pop_back();
  update_fullswing();
  Wire *l_wire = new Wire(Low_swing, 0.001/* 1 mm*/, 1);
  low_swing.delay = l_wire->delay;
  low_swing.power = l_wire->power;
  delete l_wire;
 }
 void Wire::update_fullswing()
 {
  list<Component>::iterator citer;
  double del[4];
  del[3] = this->global.delay + this->global.delay*.3;
  del[2] = global.delay + global.delay*.2;
  del[1] = global.delay + global.delay*.1;
  del[0] = global.delay + global.delay*.05;
  double threshold;
  double ncost;
  double cost;
  int i = 4;
  while (i>0) {
    threshold = del[i-1];
    cost = BIGNUM;
    for (citer = repeated_wire.begin(); citer != repeated_wire.end(); citer++)
    {
      if (citer->delay > threshold) {
        citer = repeated_wire.erase(citer);
        citer --;
      }
      else {
        ncost = citer->power.readOp.dynamic/global.power.readOp.dynamic +
                citer->power.readOp.leakage/global.power.readOp.leakage;
        if(ncost < cost)
        {
          cost = ncost;
          if (i == 4) {
            global_30.delay = citer->delay;
            global_30.power = citer->power;
            global_30.area  = citer->area;
          }
          else if (i==3) {
            global_20.delay = citer->delay;
            global_20.power = citer->power;
            global_20.area  = citer->area;
          }
          else if(i==2) {
            global_10.delay = citer->delay;
            global_10.power = citer->power;
            global_10.area  = citer->area;
          }
          else if(i==1) {
            global_5.delay = citer->delay;
            global_5.power = citer->power;
            global_5.area  = citer->area;
          }
        }
      }
    }
    i--;
  }
 }
 /*
  * Evaluates a repeated wire of unit length (len = 1) for a given repeater
  * spacing and size.
  *
  * space  repeater spacing; stored into the repeater_spacing member
  * size   repeater size; stored into the repeater_size member
  * delay  out-parameter: receives the delay of the unit-length wire
  *
  * Returns the dynamic, leakage and gate-leakage figures for the wire.
  *
  * The time constant uses the same expression as delay_optimal_wire(): the
  * wire resistance of a segment charges the input capacitance of the next
  * repeater, so the third term uses input_cap.
  */
 powerDef Wire::wire_model (double space, double size, double *delay)
 {
   powerDef ptemp;
   double len = 1;
   double beta = pmos_to_nmos_sz_ratio();
   // switching energy
   double switching = 0;
   // short-circuit energy
   double short_ckt = 0;
   // time constant
   double tc = 0;
   // input cap of min sized driver
   double input_cap = gate_C (g_tp.min_w_nmos_ +
       min_w_pmos, 0);
   // output parasitic capacitance of
   // the min. sized driver
   double out_cap = drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
     drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def);
   // drive resistance
   double out_res = (tr_R_on(g_tp.min_w_nmos_, NCH, 1) +
       tr_R_on(min_w_pmos, PCH, 1))/2;
   double wr = wire_res(len); //ohm
   // wire cap /m
   double wc = wire_cap(len);
   repeater_spacing = space;
   repeater_size = size;
   switching = (repeater_size * (input_cap + out_cap) +
       repeater_spacing * (wc/len)) * deviceType->Vdd * deviceType->Vdd;
   tc = out_res * (input_cap + out_cap) +
     out_res * wc/len * repeater_spacing/repeater_size +
     wr/len * repeater_spacing * input_cap * repeater_size +
     0.5 * (wr/len) * (wc/len)* repeater_spacing * repeater_spacing;
   *delay = 0.693 * tc * len/repeater_spacing;
 #define Ishort_ckt 65e-6 /* across all tech Ref:Banerjee et al. {IEEE TED} */
   short_ckt = deviceType->Vdd * g_tp.min_w_nmos_ * Ishort_ckt * 1.0986 *
     repeater_size * tc;
   // len/repeater_spacing is the number of repeater stages along the wire
   ptemp.readOp.dynamic = ((len/repeater_spacing)*(switching + short_ckt));
   ptemp.readOp.leakage = ((len/repeater_spacing)*
       deviceType->Vdd*
       cmos_Isub_leakage(g_tp.min_w_nmos_*repeater_size, beta*g_tp.min_w_nmos_*repeater_size, 1, inv));
   ptemp.readOp.gate_leakage = ((len/repeater_spacing)*
       deviceType->Vdd*
       cmos_Ig_leakage(g_tp.min_w_nmos_*repeater_size, beta*g_tp.min_w_nmos_*repeater_size, 1, inv));
   return ptemp;
 }
 void
 Wire::print_wire()
 {
  cout << "\nWire Properties:\n\n";
  cout << "  Delay Optimal\n\tRepeater size - "<< global.area.h <<
    " \n\tRepeater spacing - " << global.area.w*1e3 << " (mm)"
    " \n\tDelay - " << global.delay*1e6 <<  " (ns/mm)"
    " \n\tPowerD - " << global.power.readOp.dynamic *1e6<< " (nJ/mm)"
    " \n\tPowerL - " << global.power.readOp.leakage << " (mW/mm)"
    " \n\tPowerLgate - " << global.power.readOp.gate_leakage << " (mW/mm)\n";
  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
  cout <<endl;
  cout << "  5% Overhead\n\tRepeater size - "<< global_5.area.h <<
    " \n\tRepeater spacing - " << global_5.area.w*1e3 << " (mm)"
    " \n\tDelay - " << global_5.delay *1e6<<  " (ns/mm)"
    " \n\tPowerD - " << global_5.power.readOp.dynamic *1e6<< " (nJ/mm)"
    " \n\tPowerL - " << global_5.power.readOp.leakage << " (mW/mm)"
    " \n\tPowerLgate - " << global_5.power.readOp.gate_leakage << " (mW/mm)\n";
  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
  cout <<endl;
  cout << "  10% Overhead\n\tRepeater size - "<< global_10.area.h <<
    " \n\tRepeater spacing - " << global_10.area.w*1e3 << " (mm)"
    " \n\tDelay - " << global_10.delay *1e6<<  " (ns/mm)"
    " \n\tPowerD - " << global_10.power.readOp.dynamic *1e6<< " (nJ/mm)"
    " \n\tPowerL - " << global_10.power.readOp.leakage << " (mW/mm)"
    " \n\tPowerLgate - " << global_10.power.readOp.gate_leakage << " (mW/mm)\n";
  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
  cout <<endl;
  cout << "  20% Overhead\n\tRepeater size - "<< global_20.area.h <<
    " \n\tRepeater spacing - " << global_20.area.w*1e3 << " (mm)"
    " \n\tDelay - " << global_20.delay *1e6<<  " (ns/mm)"
    " \n\tPowerD - " << global_20.power.readOp.dynamic *1e6<< " (nJ/mm)"
    " \n\tPowerL - " << global_20.power.readOp.leakage << " (mW/mm)"
    " \n\tPowerLgate - " << global_20.power.readOp.gate_leakage << " (mW/mm)\n";
  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
  cout <<endl;
  cout << "  30% Overhead\n\tRepeater size - "<< global_30.area.h <<
    " \n\tRepeater spacing - " << global_30.area.w*1e3 << " (mm)"
    " \n\tDelay - " << global_30.delay *1e6<<  " (ns/mm)"
    " \n\tPowerD - " << global_30.power.readOp.dynamic *1e6<< " (nJ/mm)"
    " \n\tPowerL - " << global_30.power.readOp.leakage << " (mW/mm)"
    " \n\tPowerLgate - " << global_30.power.readOp.gate_leakage << " (mW/mm)\n";
  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
  cout <<endl;
  cout << "  Low-swing wire (1 mm) - Note: Unlike repeated wires, \n\tdelay and power "
            "values of low-swing wires do not\n\thave a linear relationship with length." <<
      " \n\tdelay - " << low_swing.delay *1e9<<  " (ns)"
      " \n\tpowerD - " << low_swing.power.readOp.dynamic *1e9<< " (nJ)"
      " \n\tPowerL - " << low_swing.power.readOp.leakage << " (mW)"
      " \n\tPowerLgate - " << low_swing.power.readOp.gate_leakage << " (mW)\n";
  cout << "\tWire width - " <<wire_width_init * 2 /* differential */<< " microns\n";
  cout << "\tWire spacing - " <<wire_spacing_init * 2 /* differential */<< " microns\n";
  cout <<endl;
  cout <<endl;
 }
--- a/ext/mcpat/cacti/wire.h
+++ b/ext/mcpat/cacti/wire.h
@ -0,0 +1,124 @@
 /*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __WIRE_H__
 #define __WIRE_H__
 #include <iostream>
 #include <list>
 #include "assert.h"
 #include "basic_circuit.h"
 #include "cacti_interface.h"
 #include "component.h"
 #include "parameter.h"
 // Model of an on-chip wire, either full-swing with repeaters (the Global*
 // wire types) or low-swing differential.  Derives from Component, whose
 // delay/power/area members hold the results.
 class Wire : public Component
 {
  public:
    // Builds one wire of the given type and length.
    Wire(enum Wire_type wire_model, double len /* in u*/,
         int nsense = 1/* no. of sense amps connected to the low-swing wire */,
         double width_scaling = 1,
         double spacing_scaling = 1,
         enum Wire_placement wire_placement = outside_mat,
         double resistivity = CU_RESISTIVITY,
         TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
    ~Wire();
    Wire( double width_scaling = 1,
         double spacing_scaling = 1,
         enum Wire_placement wire_placement = outside_mat,
         double resistivity = CU_RESISTIVITY,
         TechnologyParameter::DeviceType *dt = &(g_tp.peri_global)
    ); // should be used only once for initializing static members
    // Sweeps repeater size/spacing and fills the static design points below.
    void init_wire();
    // Computes delay/power/area of this wire from its type and placement.
    void calculate_wire_stats();
    // Computes repeater size/spacing and the resulting delay/power/area.
    void delay_optimal_wire();
    // Capacitance (F) of a wire of length len (m).
    double wire_cap(double len, bool call_from_outside=false);
    // Resistance (ohm) of a wire of length len (m).
    double wire_res(double len);
    // Delay/power of a low-swing wire: transmitter, wire and sense amp.
    void low_swing_model();
    double signal_fall_time();
    double signal_rise_time();
    double sense_amp_input_cap();
    enum Wire_type wt;
    double wire_spacing;
    double wire_width;
    enum Wire_placement wire_placement;
    double repeater_size;
    double repeater_spacing;
    double wire_length;
    double in_rise_time, out_rise_time;
    // Sets the input rise time; low_swing_model() falls back to
    // signal_rise_time() when this is 0.
    void set_in_rise_time(double rt)
    {
      in_rise_time = rt;
    }
    // Shared design points: delay-optimal repeated wire, the variants that
    // allow 5/10/20/30% more delay, and a reference low-swing wire.  In
    // the global* components area.w / area.h hold repeater spacing / size.
    static Component global;
    static Component global_5;
    static Component global_10;
    static Component global_20;
    static Component global_30;
    static Component low_swing;
    // Wire width/spacing captured when the static members were initialized.
    static double wire_width_init;
    static double wire_spacing_init;
    // Prints the static design points to cout.
    void print_wire();
  private:
    int nsense; // no. of sense amps connected to a low-swing wire if it
                // is broadcasting data to multiple destinations
    // width and spacing scaling factor can be used
    // to model low level wires or special
    // fat wires
    double w_scale, s_scale;
    double resistivity;
    // Evaluates a unit-length repeated wire for the given repeater
    // spacing/size; writes the delay through *delay and returns the power.
    powerDef wire_model (double space, double size, double *delay);
    // Candidate (spacing, size) design points collected by init_wire().
    list <Component> repeated_wire;
    // Picks global_5 ... global_30 from repeated_wire.
    void update_fullswing();
    // Set to 1 once the static members have been initialized.
    static int initialized;
    //low-swing
    Component transmitter;
    Component l_wire;
    Component sense_amp;
    double min_w_pmos;
    TechnologyParameter::DeviceType *deviceType;
 };
 #endif
--- a/ext/mcpat/core.cc
+++ b/ext/mcpat/core.cc
--- a/ext/mcpat/core.h
+++ b/ext/mcpat/core.h
@ -0,0 +1,262 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef CORE_H_
 #define CORE_H_
 #include "XML_Parse.h"
 #include "array.h"
 #include "basic_components.h"
 #include "interconnect.h"
 #include "logic.h"
 #include "parameter.h"
 #include "sharedcache.h"
 /*
  * Component subclass modelling a core's branch predictor (per the class
  * name). It carries the XML configuration handle, the core index, private
  * copies of the interface and dynamic parameters, and pointers to the
  * ArrayST sub-models it aggregates.
  *
  * Member declaration order is significant (it fixes construction order)
  * and must not be rearranged.
  */
 class BranchPredictor : public Component
 {
   public:
     ParseXML *XML;
     int ithCore;
     InputParameter interface_ip;
     CoreDynParam coredynp;
     double clockRate;
     double executionTime;
     double scktRatio;
     double chip_PR_overhead;
     double macro_PR_overhead;

     // ArrayST sub-models; presumably the predictor tables, the chooser
     // and the return address stack, judging by the names.
     ArrayST *globalBPT;
     ArrayST *localBPT;
     ArrayST *L1_localBPT;
     ArrayST *L2_localBPT;
     ArrayST *chooser;
     ArrayST *RAS;

     bool exist;

     BranchPredictor(ParseXML *XML_interface,
                     int ithCore_,
                     InputParameter* interface_ip_,
                     const CoreDynParam & dyn_p_,
                     bool exsit=true);
     void computeEnergy(bool is_tdp=true);
     void displayEnergy(uint32_t indent = 0,
                        int plevel = 100,
                        bool is_tdp=true);
     ~BranchPredictor();
 };
 /*
  * Component subclass modelling the instruction fetch unit (per the class
  * name). Besides the common configuration members it embeds an InstCache
  * by value and holds pointers to ArrayST sub-models, a BranchPredictor
  * and three inst_decoder objects.
  *
  * Member declaration order is significant (it fixes construction order)
  * and must not be rearranged.
  */
 class InstFetchU : public Component
 {
   public:
     ParseXML *XML;
     int ithCore;
     InputParameter interface_ip;
     CoreDynParam coredynp;
     double clockRate;
     double executionTime;
     double scktRatio;
     double chip_PR_overhead;
     double macro_PR_overhead;
     enum Cache_policy cache_p;

     InstCache icache;
     ArrayST *IB;   // presumably the instruction buffer -- confirm in core.cc
     ArrayST *BTB;  // presumably the branch target buffer -- confirm in core.cc
     BranchPredictor *BPT;
     inst_decoder *ID_inst;
     inst_decoder *ID_operand;
     inst_decoder *ID_misc;

     bool exist;

     InstFetchU(ParseXML *XML_interface,
                int ithCore_,
                InputParameter* interface_ip_,
                const CoreDynParam & dyn_p_,
                bool exsit=true);
     void computeEnergy(bool is_tdp=true);
     void displayEnergy(uint32_t indent = 0,
                        int plevel = 100,
                        bool is_tdp=true);
     ~InstFetchU();
 };
 /*
  * Component subclass modelling the instruction scheduler (per the class
  * name). Holds three ArrayST sub-models (integer window, FP window, ROB),
  * a selection_logic object and the height values recorded for those
  * structures.
  *
  * Member declaration order is significant (it fixes construction order)
  * and must not be rearranged.
  */
 class SchedulerU : public Component
 {
   public:
     ParseXML *XML;
     int ithCore;
     InputParameter interface_ip;
     CoreDynParam coredynp;
     double clockRate;
     double executionTime;
     double scktRatio;
     double chip_PR_overhead;
     double macro_PR_overhead;
     double Iw_height;
     double fp_Iw_height;
     double ROB_height;

     ArrayST *int_inst_window;
     ArrayST *fp_inst_window;
     ArrayST *ROB;
     selection_logic *instruction_selection;

     bool exist;

     SchedulerU(ParseXML *XML_interface,
                int ithCore_,
                InputParameter* interface_ip_,
                const CoreDynParam & dyn_p_,
                bool exist_=true);
     void computeEnergy(bool is_tdp=true);
     void displayEnergy(uint32_t indent = 0,
                        int plevel = 100,
                        bool is_tdp=true);
     ~SchedulerU();
 };
 /*
  * Component subclass modelling the register renaming unit (per the class
  * name). Aggregates ArrayST sub-models for integer/FP alias tables and
  * free lists (as the member names suggest), two
  * dep_resource_conflict_check objects and the register alias history
  * table.
  *
  * Member declaration order is significant (it fixes construction order)
  * and must not be rearranged.
  */
 class RENAMINGU : public Component
 {
   public:
     ParseXML *XML;
     int ithCore;
     InputParameter interface_ip;
     double clockRate;
     double executionTime;
     CoreDynParam coredynp;

     ArrayST *iFRAT;
     ArrayST *fFRAT;
     ArrayST *iRRAT;
     ArrayST *fRRAT;
     ArrayST *ifreeL;
     ArrayST *ffreeL;
     dep_resource_conflict_check *idcl;
     dep_resource_conflict_check *fdcl;
     // Register alias history table; used to store GC.
     ArrayST *RAHT;

     bool exist;

     RENAMINGU(ParseXML *XML_interface,
               int ithCore_,
               InputParameter* interface_ip_,
               const CoreDynParam & dyn_p_,
               bool exist_=true);
     void computeEnergy(bool is_tdp=true);
     void displayEnergy(uint32_t indent = 0,
                        int plevel = 100,
                        bool is_tdp=true);
     ~RENAMINGU();
 };
 /*
  * Component subclass modelling the load/store unit (per the class name).
  * Embeds a DataCache by value and holds pointers to the ArrayST queue
  * sub-models.
  *
  * Member declaration order is significant (it fixes construction order)
  * and must not be rearranged.
  */
 class LoadStoreU : public Component
 {
   public:
     ParseXML *XML;
     int ithCore;
     InputParameter interface_ip;
     CoreDynParam coredynp;
     enum Cache_policy cache_p;
     double clockRate;
     double executionTime;
     double scktRatio;
     double chip_PR_overhead;
     double macro_PR_overhead;
     double lsq_height;

     DataCache dcache;
     // Actually the store queue; for in-order processors it serves as
     // both the load queue and the store queue.
     ArrayST *LSQ;
     ArrayST *LoadQ;

     bool exist;

     LoadStoreU(ParseXML *XML_interface,
                int ithCore_,
                InputParameter* interface_ip_,
                const CoreDynParam & dyn_p_,
                bool exist_=true);
     void computeEnergy(bool is_tdp=true);
     void displayEnergy(uint32_t indent = 0,
                        int plevel = 100,
                        bool is_tdp=true);
     ~LoadStoreU();
 };
 /*
  * Component subclass modelling the memory management unit (per the class
  * name). Holds two ArrayST sub-models, itlb and dtlb.
  *
  * Member declaration order is significant (it fixes construction order)
  * and must not be rearranged.
  */
 class MemManU : public Component
 {
   public:
     ParseXML *XML;
     int ithCore;
     InputParameter interface_ip;
     CoreDynParam coredynp;
     double clockRate;
     double executionTime;
     double scktRatio;
     double chip_PR_overhead;
     double macro_PR_overhead;

     ArrayST *itlb;
     ArrayST *dtlb;

     bool exist;

     MemManU(ParseXML *XML_interface,
             int ithCore_,
             InputParameter* interface_ip_,
             const CoreDynParam & dyn_p_,
             bool exist_=true);
     void computeEnergy(bool is_tdp=true);
     void displayEnergy(uint32_t indent = 0,
                        int plevel = 100,
                        bool is_tdp=true);
     ~MemManU();
 };
 /*
  * Component subclass modelling the register file unit (per the class
  * name). Holds three ArrayST sub-models (IRF, FRF, RFWIN) and the height
  * values recorded for the integer and FP register files.
  *
  * Member declaration order is significant (it fixes construction order)
  * and must not be rearranged.
  */
 class RegFU : public Component
 {
   public:
     ParseXML *XML;
     int ithCore;
     InputParameter interface_ip;
     CoreDynParam coredynp;
     double clockRate;
     double executionTime;
     double scktRatio;
     double chip_PR_overhead;
     double macro_PR_overhead;
     double int_regfile_height;
     double fp_regfile_height;

     ArrayST *IRF;
     ArrayST *FRF;
     ArrayST *RFWIN;

     bool exist;

     RegFU(ParseXML *XML_interface,
           int ithCore_,
           InputParameter* interface_ip_,
           const CoreDynParam & dyn_p_,
           bool exist_=true);
     void computeEnergy(bool is_tdp=true);
     void displayEnergy(uint32_t indent = 0,
                        int plevel = 100,
                        bool is_tdp=true);
     ~RegFU();
 };
 /*
  * Component subclass modelling the execution unit (per the class name).
  * Aggregates a RegFU, a SchedulerU, three FunctionalUnit objects and six
  * interconnect objects for the bypass paths, plus a Component named
  * `bypass`. The constructor additionally takes an lsq_height_ value.
  *
  * Member declaration order is significant (it fixes construction order)
  * and must not be rearranged.
  */
 class EXECU : public Component
 {
   public:
     ParseXML *XML;
     int ithCore;
     InputParameter interface_ip;
     double clockRate;
     double executionTime;
     double scktRatio;
     double chip_PR_overhead;
     double macro_PR_overhead;
     double lsq_height;
     CoreDynParam coredynp;

     RegFU *rfu;
     SchedulerU *scheu;
     FunctionalUnit *fp_u;
     FunctionalUnit *exeu;
     FunctionalUnit *mul;

     interconnect *int_bypass;
     interconnect *intTagBypass;
     interconnect *int_mul_bypass;
     interconnect *intTag_mul_Bypass;
     interconnect *fp_bypass;
     interconnect *fpTagBypass;

     Component bypass;
     bool exist;

     EXECU(ParseXML *XML_interface,
           int ithCore_,
           InputParameter* interface_ip_,
           double lsq_height_,
           const CoreDynParam & dyn_p_,
           bool exist_=true);
     void computeEnergy(bool is_tdp=true);
     void displayEnergy(uint32_t indent = 0,
                        int plevel = 100,
                        bool is_tdp=true);
     ~EXECU();
 };
 /*
  * Top-level per-core Component. Aggregates pointers to the unit models
  * declared in this header (InstFetchU, LoadStoreU, MemManU, EXECU,
  * RENAMINGU) together with Pipeline, UndiffCore and SharedCache objects,
  * and keeps the core's dynamic parameters in coredynp.
  *
  * Member declaration order is significant (it fixes construction order)
  * and must not be rearranged.
  */
 class Core : public Component
 {
   public:
     ParseXML *XML;
     int ithCore;
     InputParameter interface_ip;
     double clockRate;
     double executionTime;
     double scktRatio;
     double chip_PR_overhead;
     double macro_PR_overhead;

     InstFetchU *ifu;
     LoadStoreU *lsu;
     MemManU *mmu;
     EXECU *exu;
     RENAMINGU *rnu;
     Pipeline *corepipe;
     UndiffCore *undiffCore;
     SharedCache *l2cache;
     CoreDynParam coredynp;
     //full_decoder  inst_decoder;
     //clock_network clockNetwork;

     Core(ParseXML *XML_interface,
          int ithCore_,
          InputParameter* interface_ip_);
     void set_core_param();
     void computeEnergy(bool is_tdp=true);
     void displayEnergy(uint32_t indent = 0,
                        int plevel = 100,
                        bool is_tdp=true);
     ~Core();
 };
 #endif /* CORE_H_ */
--- a/ext/mcpat/globalvar.h
+++ b/ext/mcpat/globalvar.h
@ -0,0 +1,48 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 // Declares the project-wide global flag(s) using the single-definition
 // EXTERN idiom: the one translation unit that defines GLOBALVAR before
 // including this header gets the actual definitions; every other includer
 // gets `extern` declarations.
 #ifndef GLOBALVAR_H_
 #define GLOBALVAR_H_
 #ifdef  GLOBALVAR
 // Defining translation unit: EXTERN expands to nothing, so the
 // declarations below become definitions.
 #define EXTERN
 #else
 // All other translation units: declarations only.
 #define EXTERN extern
 #endif
 // Read by the interconnect constructor, together with its opt_local flag,
 // to decide whether to widen wires until the timing target is met.
 EXTERN bool opt_for_clk;
 #endif /* GLOBALVAR_H_ */
--- a/ext/mcpat/interconnect.cc
+++ b/ext/mcpat/interconnect.cc
@ -0,0 +1,222 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include <cassert>
 #include <iostream>
 #include "globalvar.h"
 #include "interconnect.h"
 #include "wire.h"
 /*
  * Builds an interconnect (wire bundle) model and computes its delay,
  * power and area.
  *
  * Parameters:
  *   name_                 label used in the latency warning message.
  *   device_ty_, core_ty_  passed to longer_channel_device_reduction().
  *   base_w, base_h        stored in base_width / base_height.
  *   data_w                number of bits; per-wire power and area are
  *                         multiplied by it.
  *   len                   wire length handed to the Wire model.
  *   configure_interface   copied into l_ip and passed to init_interface();
  *                         l_ip.latency / l_ip.throughput become the timing
  *                         targets. Must not be null.
  *   start_wiring_level_   stored only.
  *   pipelinable_          false: the wire must meet `latency`;
  *                         true: the wire must meet `throughput`, and
  *                         pipeline stages are inserted if it cannot.
  *   route_over_perc_      blend factor between wire area and
  *                         no-device-under-wire area (pipelinable only).
  *   opt_local_            together with the global opt_for_clk, enables
  *                         the width/spacing scaling loop.
  *   wire_model            stored in wt, then overridden (see note below).
  *   width_s, space_s      initial width and spacing scaling factors.
  *   dt                    device type used for the PMOS sizing ratio.
  *
  * Side effects: the scaling loop re-initialises Wire's static members via
  * a temporary Wire; they are reset to the default scaling before
  * returning. May print a warning to stdout.
  *
  * NOTE(review): num_pipe_stages is only assigned when a pipelinable wire
  * misses its throughput target; on every other path it is left unset.
  */
 interconnect::interconnect(
     string name_,
     enum Device_ty device_ty_,
     double base_w, double base_h,
     int data_w, double len, const InputParameter *configure_interface,
     int start_wiring_level_,
     bool pipelinable_,
     double route_over_perc_,
     bool opt_local_,
     enum Core_type core_ty_,
     enum Wire_type wire_model,
     double width_s, double space_s,
     TechnologyParameter::DeviceType *dt
 )
 : name(name_),
   device_ty(device_ty_),
   in_rise_time(0),
   out_rise_time(0),
   base_width(base_w),
   base_height(base_h),
   data_width(data_w),
   wt(wire_model),
   width_scaling(width_s),
   space_scaling(space_s),
   start_wiring_level(start_wiring_level_),
   length(len),
   //interconnect_latency(1e-12),
   //interconnect_throughput(1e-12),
   opt_local(opt_local_),
   core_ty(core_ty_),
   pipelinable(pipelinable_),
   route_over_perc(route_over_perc_),
   deviceType(dt)
 {
     // NOTE(review): this unconditionally replaces the wire_model argument
     // stored by the initializer list above.
     wt = Global;

     l_ip = *configure_interface;
     local_result = init_interface(&l_ip);
     max_unpipelined_link_delay = 0; //TODO

     min_w_nmos = g_tp.min_w_nmos_;
     min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * min_w_nmos;

     latency = l_ip.latency;
     throughput = l_ip.throughput;
     latency_overflow = false;
     throughput_overflow = false;

     /*
      * TODO: Add wiring option from semi-global to global automatically
      * And directly jump to global if semi-global cannot satisfy timing
      * Fat wires only available for global wires, thus
      * if signal wiring layer starts from semi-global,
      * the next layer up will be global, i.e., semi-global does
      * not have fat wires.
      *
      * TODO: Add pipe regs power, area, and timing;
      * Pipelinable wires optimize latency first.
      */

     // Non-pipelinable wires (such as bypass logic) care about latency;
     // pipelinable wires (such as a bus) care about throughput instead.
     const double delay_target = pipelinable ? throughput : latency;

     compute();
     if (opt_for_clk && opt_local) {
         // Double wire width and spacing until the target is met or the
         // scaling factor reaches its cap.
         while (delay > delay_target && width_scaling < 3.0) {
             width_scaling *= 2;
             space_scaling *= 2;
             // Constructing a Wire this way re-initialises Wire's static
             // members for the new scaling.
             Wire winit(width_scaling, space_scaling);
             compute();
         }

         if (delay > delay_target) {
             if (pipelinable) {
                 // insert pipeline stages
                 num_pipe_stages = (int)ceil(delay / throughput);
                 assert(num_pipe_stages > 0);
                 delay = delay / num_pipe_stages
                         + num_pipe_stages * 0.05 * delay;
             } else {
                 latency_overflow = true;
             }
         }
     }

     // Keep the single-wire numbers, then scale to the full bus width.
     power_bit = power;
     power.readOp.dynamic *= data_width;
     power.readOp.leakage *= data_width;
     power.readOp.gate_leakage *= data_width;

     area.set_area(area.get_area() * data_width);
     no_device_under_wire_area.h *= data_width;

     if (latency_overflow == true)
         cout<< "Warning: "<< name <<" wire structure cannot satisfy latency constraint." << endl;

     assert(power.readOp.dynamic > 0);
     assert(power.readOp.leakage > 0);
     assert(power.readOp.gate_leakage > 0);

     double long_channel_device_reduction =
         longer_channel_device_reduction(device_ty, core_ty);

     double sckRation = g_tp.sckt_co_eff;
     power.readOp.dynamic *= sckRation;
     power.writeOp.dynamic *= sckRation;
     power.searchOp.dynamic *= sckRation;

     power.readOp.longer_channel_leakage =
         power.readOp.leakage * long_channel_device_reduction;

     // Only global wires have the option to choose whether to route over
     // other logic or not.
     if (pipelinable)
         area.set_area(area.get_area() * route_over_perc
                       + no_device_under_wire_area.get_area()
                         * (1 - route_over_perc));

     // Restore Wire's static members to the default (unscaled) state.
     // The previous `Wire wreset();` declared a function instead of
     // constructing an object, so the reset never actually happened.
     Wire wreset;
 }
 void
 interconnect::compute()
 {
  Wire *wtemp1 = 0;
  wtemp1 = new Wire(wt, length, 1, width_scaling, space_scaling);
  delay = wtemp1->delay;
  power.readOp.dynamic = wtemp1->power.readOp.dynamic;
  power.readOp.leakage = wtemp1->power.readOp.leakage;
  power.readOp.gate_leakage = wtemp1->power.readOp.gate_leakage;
  area.set_area(wtemp1->area.get_area());
  no_device_under_wire_area.h =  (wtemp1->wire_width + wtemp1->wire_spacing);
  no_device_under_wire_area.w = length;
  if (wtemp1)
   delete wtemp1;
 }
 void interconnect::leakage_feedback(double temperature)
 {
  l_ip.temp = (unsigned int)round(temperature/10.0)*10;
  uca_org_t init_result = init_interface(&l_ip); // init_result is dummy
  compute();
  power_bit = power;
  power.readOp.dynamic *= data_width;
  power.readOp.leakage *= data_width;
  power.readOp.gate_leakage *= data_width;
  assert(power.readOp.dynamic > 0);
  assert(power.readOp.leakage > 0);
  assert(power.readOp.gate_leakage > 0);
  double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
  double sckRation = g_tp.sckt_co_eff;
  power.readOp.dynamic *= sckRation;
  power.writeOp.dynamic *= sckRation;
  power.searchOp.dynamic *= sckRation;
  power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
 }
--- a/ext/mcpat/interconnect.h
+++ b/ext/mcpat/interconnect.h
@ -0,0 +1,111 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef __INTERCONNECT_H__
 #define __INTERCONNECT_H__
 #include "assert.h"
 #include "basic_circuit.h"
 #include "basic_components.h"
 #include "cacti_interface.h"
 #include "component.h"
 #include "parameter.h"
 #include "subarray.h"
 #include "wire.h"
 // leakage power includes entire htree in a bank (when uca_tree == false)
 // leakage power includes only part to one bank when uca_tree == true
 /*
  * Component subclass modelling a bundle of wires. The constructor
  * evaluates the wire model, optionally widens the wires to meet a
  * latency or throughput target, and scales single-wire power and area
  * by the data width.
  *
  * Member declaration order is significant: it must stay consistent with
  * the constructor's initializer list.
  */
 class interconnect : public Component
 {
   public:
     interconnect(
         string name_,
         enum Device_ty device_ty_,
         double base_w,
         double base_h,
         int data_w,
         double len,
         const InputParameter *configure_interface,
         int start_wiring_level_,
         bool pipelinable_ = false,
         double route_over_perc_ = 0.5,
         bool opt_local_ = true,
         enum Core_type core_ty_ = Inorder,
         enum Wire_type wire_model = Global,
         double width_s = 1.0,
         double space_s = 1.0,
         TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));

     ~interconnect() {}

     // Evaluates one wire and stores its delay, power and area here.
     void compute();

     string name;
     enum Device_ty device_ty;
     double in_rise_time;
     double out_rise_time;
     InputParameter l_ip;      // private copy of the configuration
     uca_org_t local_result;   // value returned by init_interface()
     Area no_device_under_wire_area;

     void set_in_rise_time(double rt)
     {
         in_rise_time = rt;
     }

     // Re-evaluates power after a temperature change.
     void leakage_feedback(double temperature);

     double max_unpipelined_link_delay;
     powerDef power_bit;       // power of a single wire
     double wire_bw;
     double init_wire_bw;      // bus width at root
     double base_width;
     double base_height;
     int data_width;
     enum Wire_type wt;
     double width_scaling;
     double space_scaling;
     int start_wiring_level;
     double length;
     double min_w_nmos;
     double min_w_pmos;
     double latency;
     double throughput;
     bool latency_overflow;
     bool throughput_overflow;
     double interconnect_latency;
     double interconnect_throughput;
     bool opt_local;
     enum Core_type core_ty;
     bool pipelinable;
     double route_over_perc;
     int num_pipe_stages;

   private:
     TechnologyParameter::DeviceType *deviceType;
 };
 #endif
--- a/ext/mcpat/iocontrollers.cc
+++ b/ext/mcpat/iocontrollers.cc
@ -0,0 +1,446 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include <algorithm>
 #include <cassert>
 #include <cmath>
 #include <iostream>
 #include <string>
 #include "XML_Parse.h"
 #include "basic_circuit.h"
 #include "basic_components.h"
 #include "const.h"
 #include "io.h"
 #include "iocontrollers.h"
 #include "logic.h"
 #include "parameter.h"
 /*
 SUN Niagara 2 I/O power analysis:
 total signal bits: 711
 Total FBDIMM bits: (14+10)*2*8= 384
 PCIe bits:         (8 + 8)*2 = 32
 10Gb NIC:          (4*2+4*2)*2 = 32
 Debug I/Os:        168
 Other I/Os:        711- 32-32 - 384 - 168 = 95
 According to "Implementation of an 8-Core, 64-Thread, Power-Efficient SPARC Server on a Chip"
 90% of I/Os are SerDers (the calculation is (384+64)/(711-168)=83%, about the same as the 90% reported in the paper)
 --> around 80Pins are common I/Os.
 Common I/Os consumes 71mW/Gb/s according to Cadence ChipEstimate @65nm
 Niagara 2 I/O clock is 1/4 of core clock. --> 87pin (<--((711-168)*17%)) * 71mW/Gb/s *0.25*1.4Ghz = 2.17W
 Total dynamic power of FBDIMM, NIC, PCIe = 84*0.132 + 84*0.049*0.132 = 11.14 - 2.17 = 8.98
 Further, if assuming I/O logic power is about 50% of I/Os then Total energy of FBDIMM, NIC, PCIe = 11.14 - 2.17*1.5 = 7.89
 */
 /*
 * A bug in Cadence ChipEstimator: After updating the clock rate in the clock tab, a user
 * needs to re-select the IP clock (the same clk) and then click Estimate. If the clock is not
 * re-selected, the new clock rate may not be propagated into the IPs.
 *
 */
 /*
  * Builds the network interface unit (NIU) model.
  *
  * Copies the interface parameters, runs init_interface(), reads the NIU
  * settings via set_niu_param(), then fills in area and power_t from
  * empirical constants scaled by feature size and supply voltage. Two
  * implementations are modelled, selected by niup.type:
  *   0          high-performance NIU
  *   otherwise  low-power NIU
  *
  * Leakage is estimated from a total gate count (MAC + front end + SerDer)
  * multiplied by the leakage of a 2-input NAND gate. The earlier code
  * added frontend_gates twice and never used SerDer_gates.
  */
 NIUController::NIUController(ParseXML *XML_interface,InputParameter* interface_ip_)
 :XML(XML_interface),
 interface_ip(*interface_ip_)
 {
     local_result = init_interface(&interface_ip);

     double frontend_area, mac_area, SerDer_area;
     double frontend_dyn, mac_dyn, SerDer_dyn;
     double frontend_gates, mac_gates, SerDer_gates;
     double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
     double NMOS_sizing, PMOS_sizing;

     set_niu_param();

     if (niup.type == 0) //high performance NIU
     {
         //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate using 65nm.
         mac_area = (1.53 + 0.3)/2 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
         //Area estimation based on average of die photo from Niagara 2, ISSCC "An 800mW 10Gb Ethernet Transceiver in 0.13μm CMOS"
         //and"A 1.2-V-Only 900-mW 10 Gb Ethernet Transceiver and XAUI Interface With Robust VCO Tuning Technique" Frontend is PCS
         frontend_area = (9.8 + (6 + 18)*65/130*65/130)/3 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
         //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate hard IP @65nm.
         //SerDer is very hard to scale, so it is scaled linearly, not quadratically
         SerDer_area = (1.39 + 0.36) * (interface_ip.F_sz_um/0.065);

         //total area
         area.set_area((mac_area + frontend_area + SerDer_area)*1e6);

         //Power
         //Cadence ChipEstimate using 65nm (mac, front_end are all energy. E=P*T = P/F = 1.37/1Ghz = 1.37e-9);
         //2.19W@1GHz fully active according to Cadence ChipEstimate @65nm
         mac_dyn      = 2.19e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
         //Cadence ChipEstimate using 65nm soft IP;
         frontend_dyn = 0.27e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
         //according to "A 100mW 9.6Gb/s Transceiver in 90nm CMOS..." ISSCC 2006
         //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
         SerDer_dyn   = 0.01*10*sqrt(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
         SerDer_dyn   /= niup.clockRate;//convert to energy per clock cycle of whole NIU

         //Cadence ChipEstimate using 65nm
         mac_gates       = 111700;
         frontend_gates  = 320000;
         SerDer_gates    = 200000;
         NMOS_sizing     = 5*g_tp.min_w_nmos_;
         PMOS_sizing     = 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
     }
     else
     {
         //Low power implementations are mostly from Cadence ChipEstimator; Ignore the multiple IP effect
         // ---When there are multiple IP (same kind or not) selected, Cadence ChipEstimator results are not
         // a simple summation of all IPs. Ignore this effect
         mac_area      = 0.24 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
         frontend_area = 0.1  * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);//Frontend is the PCS layer
         SerDer_area   = 0.35 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
         //Compare 130um implementation in "A 1.2-V-Only 900-mW 10 Gb Ethernet Transceiver and XAUI Interface With Robust VCO Tuning Technique"
         //and the ChipEstimator XAUI PHY hard IP, confirm that even PHY can scale perfectly with the technology

         //total area
         area.set_area((mac_area + frontend_area + SerDer_area)*1e6);

         //Power
         //Cadence ChipEstimate using 65nm (mac, front_end are all energy. E=P*T = P/F = 1.37/1Ghz = 1.37e-9);
         mac_dyn      = 1.257e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
         //Cadence ChipEstimate using 65nm soft IP;
         frontend_dyn = 0.6e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
         //SerDer_dyn is power not energy, scaling from 216mw/10Gb/s @130nm
         SerDer_dyn   = 0.0216*10*(interface_ip.F_sz_um/0.13)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
         SerDer_dyn   /= niup.clockRate;//convert to energy per clock cycle of whole NIU

         mac_gates       = 111700;
         frontend_gates  = 52000;
         SerDer_gates    = 199260;
         NMOS_sizing     = g_tp.min_w_nmos_;
         PMOS_sizing     = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
     }

     power_t.readOp.dynamic = mac_dyn + frontend_dyn + SerDer_dyn;

     // Every block contributes to leakage: MAC, front end and SerDer.
     const double total_gates = mac_gates + frontend_gates + SerDer_gates;

     power_t.readOp.leakage = total_gates*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
     double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
     power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
     power_t.readOp.gate_leakage = total_gates*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
 }
 /*
  * Derives the reported power from the base numbers in power_t.
  *   is_tdp == true : power    = power_t, dynamic part scaled by duty_cycle
  *   is_tdp == false: rt_power = power_t, dynamic part scaled by perc_load
  */
 void NIUController::computeEnergy(bool is_tdp)
 {
     if (!is_tdp) {
         rt_power = power_t;
         rt_power.readOp.dynamic *= niup.perc_load;
         return;
     }

     power = power_t;
     power.readOp.dynamic *= niup.duty_cycle;
 }
 void NIUController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
 {
        string indent_str(indent, ' ');
        string indent_str_next(indent+2, ' ');
        bool long_channel = XML->sys.longer_channel_device;
        if (is_tdp)
        {
                cout << "NIU:" << endl;
                cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
                cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*niup.clockRate  << " W" << endl;
                cout << indent_str<< "Subthreshold Leakage = "
                        << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
                //cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
                cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
                cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic*niup.clockRate << " W" << endl;
                cout<<endl;
        }
        else
        {
        }
 }
 /*
  * Copy the NIU settings from the parsed XML (XML->sys.niu) into niup.
  * The XML clock rate is multiplied by 1e6 after being stored.
  */
 void NIUController::set_niu_param()
 {
     // Plain copies, no conversion.
     niup.type       = XML->sys.niu.type;
     niup.num_units  = XML->sys.niu.number_units;
     niup.perc_load  = XML->sys.niu.total_load_perc;
     niup.duty_cycle = XML->sys.niu.duty_cycle;

     // Clock rate is scaled by 1e6 once stored.
     niup.clockRate  = XML->sys.niu.clockrate;
     niup.clockRate *= 1e6;
 }
 /*
  * Build the PCIe controller model.
  *
  * Reads the PCIe settings from the XML (set_pcie_param), picks the
  * constants for the selected design type, and fills in:
  *   - area
  *   - power_t.readOp.dynamic  (SerDes power is divided by pciep.clockRate so
  *     that it is expressed per clock cycle like the controller figure)
  *   - power_t.readOp.leakage / longer_channel_leakage / gate_leakage
  *
  * The model assumes a bit-sliced design: the reference figures are divided
  * by 8 to obtain per-lane numbers and then multiplied by pciep.num_channels.
  * The SerDes (PHY) contribution is included only when pciep.withPHY is set.
  *
  * XML_interface  parsed configuration; the pointer is stored in XML
  * interface_ip_  technology/interface parameters; copied into interface_ip
  */
 PCIeController::PCIeController(ParseXML *XML_interface,InputParameter* interface_ip_)
 :XML(XML_interface),
 interface_ip(*interface_ip_)
 {
     local_result = init_interface(&interface_ip);

     // All locals start at zero so that no branch can leave a value that is
     // read further down indeterminate. Previously SerDer_gates was never
     // assigned for type 0 but was still read by the leakage formulas
     // whenever withPHY was set.
     double ctrl_area = 0.0, SerDer_area = 0.0;
     double ctrl_dyn = 0.0, SerDer_dyn = 0.0;
     double ctrl_gates = 0.0, SerDer_gates = 0.0;
     double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
     double NMOS_sizing = 0.0, PMOS_sizing = 0.0;

     set_pcie_param();

     if (pciep.type == 0) //high performance PCIe controller
     {
         //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate @ 65nm.
         ctrl_area = (5.2 + 0.5)/2 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
         //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate hard IP @65nm.
         //SerDer is very hard to scale, hence the linear (not quadratic) scaling.
         SerDer_area = (3.03 + 0.36) * (interface_ip.F_sz_um/0.065);
         //A separate front-end area estimate, (5.2 + 0.1)/2 @ 65nm, was computed
         //here before but never entered the reported area; it is not modelled.

         //Power
         //Cadence ChipEstimate using 65nm the controller includes everything: the PHY, the data link and transaction layer
         ctrl_dyn = 3.75e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
         //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
         SerDer_dyn = 0.01*4*(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;//PCIe 2.0 max per lane speed is 4Gb/s
         SerDer_dyn /= pciep.clockRate;//convert to energy per clock cycle

         //Cadence ChipEstimate using 65nm
         ctrl_gates = 900000/8*pciep.num_channels;
         //No separate SerDes gate count is set for this design type (the
         //controller figure above is described as including the PHY), so the
         //SerDes leakage term is explicitly zero.
         SerDer_gates = 0.0;
         NMOS_sizing = 5*g_tp.min_w_nmos_;
         PMOS_sizing = 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
     }
     else
     {
         ctrl_area = 0.412 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
         //Area estimation based on average of die photo from Niagara 2, and Cadence ChipEstimate @ 65nm.
         SerDer_area = 0.36 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);

         //Power
         //Cadence ChipEstimate using 65nm the controller includes everything: the PHY, the data link and transaction layer
         ctrl_dyn = 2.21e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
         //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
         SerDer_dyn = 0.01*4*(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;//PCIe 2.0 max per lane speed is 4Gb/s
         SerDer_dyn /= pciep.clockRate;//convert to energy per clock cycle

         //Cadence ChipEstimate using 65nm
         ctrl_gates = 200000/8*pciep.num_channels;
         SerDer_gates = 200000/8*pciep.num_channels;
         NMOS_sizing = g_tp.min_w_nmos_;
         PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
     }

     // The reference areas are for 8 lanes: convert to per-lane, then scale by lane count.
     area.set_area(((ctrl_area + (pciep.withPHY? SerDer_area:0))/8*pciep.num_channels)*1e6);
     power_t.readOp.dynamic = (ctrl_dyn + (pciep.withPHY? SerDer_dyn:0))*pciep.num_channels;
     power_t.readOp.leakage = (ctrl_gates + (pciep.withPHY? SerDer_gates:0))*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
     double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
     power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
     power_t.readOp.gate_leakage = (ctrl_gates + (pciep.withPHY? SerDer_gates:0))*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
 }
 /*
  * Derive the reported PCIe power from the baseline stored in power_t.
  *   is_tdp == true  : fills `power`,    dynamic part scaled by pciep.duty_cycle
  *   is_tdp == false : fills `rt_power`, dynamic part scaled by pciep.perc_load
  */
 void PCIeController::computeEnergy(bool is_tdp)
 {
     if (!is_tdp)
     {
         // Runtime figures: weight the dynamic part by the observed load.
         rt_power = power_t;
         rt_power.readOp.dynamic *= pciep.perc_load;
         return;
     }

     // Peak (TDP) figures: weight the dynamic part by the duty cycle.
     power = power_t;
     power.readOp.dynamic *= pciep.duty_cycle;
 }
 void PCIeController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
 {
        string indent_str(indent, ' ');
        string indent_str_next(indent+2, ' ');
        bool long_channel = XML->sys.longer_channel_device;
        if (is_tdp)
        {
                cout << "PCIe:" << endl;
                cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
                cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*pciep.clockRate  << " W" << endl;
                cout << indent_str<< "Subthreshold Leakage = "
                        << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
                //cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
                cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
                cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic*pciep.clockRate << " W" << endl;
                cout<<endl;
        }
        else
        {
        }
 }
 /*
  * Copy the PCIe settings from the parsed XML (XML->sys.pcie) into pciep.
  * The XML clock rate is multiplied by 1e6 after being stored.
  */
 void PCIeController::set_pcie_param()
 {
     // Plain copies, no conversion.
     pciep.type         = XML->sys.pcie.type;
     pciep.withPHY      = XML->sys.pcie.withPHY;
     pciep.num_units    = XML->sys.pcie.number_units;
     pciep.num_channels = XML->sys.pcie.num_channels;
     pciep.perc_load    = XML->sys.pcie.total_load_perc;
     pciep.duty_cycle   = XML->sys.pcie.duty_cycle;

     // Clock rate is scaled by 1e6 once stored.
     pciep.clockRate    = XML->sys.pcie.clockrate;
     pciep.clockRate   *= 1e6;
 }
 /*
  * Build the flash controller model.
  *
  * Reads the flash controller settings from the XML (set_fc_param) and fills
  * in area, power_t.readOp.dynamic and the leakage terms. Unlike the NIU and
  * PCIe models, the dynamic figures here are power (W), not energy per cycle.
  *
  * Only the low-power design (fcp.type != 0) is modelled. Requesting type 0
  * prints a message and terminates the process with a non-zero exit status
  * (it used to exit with status 0, reporting an unsupported configuration as
  * success).
  *
  * Additional channels are charged at 20% of the first one:
  * number_channel = 1 + (fcp.num_channels - 1) * 0.2.
  * The SerDes (PHY) contribution is included only when fcp.withPHY is set.
  *
  * XML_interface  parsed configuration; the pointer is stored in XML
  * interface_ip_  technology/interface parameters; copied into interface_ip
  */
 FlashController::FlashController(ParseXML *XML_interface,InputParameter* interface_ip_)
 :XML(XML_interface),
 interface_ip(*interface_ip_)
 {
     local_result = init_interface(&interface_ip);

     double ctrl_area = 0.0, SerDer_area = 0.0;
     double ctrl_dyn = 0.0, SerDer_dyn = 0.0;
     double ctrl_gates = 0.0, SerDer_gates = 0.0;
     double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
     double NMOS_sizing = 0.0, PMOS_sizing = 0.0;

     set_fc_param();

     if (fcp.type == 0) //high performance flash controller: not supported
     {
         cout<<"Current McPAT does not support high performance flash contorller since even low power designs are enough for maintain throughput"<<endl;
         exit(1);
     }

     //Area estimation based on Cadence ChipEstimate @ 65nm: NANDFLASH-CTRL from CAST
     ctrl_area   = 0.243 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
     //based On PCIe PHY TSMC65GP from Cadence ChipEstimate @ 65nm, it support 8x lanes with each lane
     //speed up to 250MB/s (PCIe1.1x) This is already saturate the 200MB/s of the flash controller core above.
     //The /8 takes a single lane out of that 8-lane PHY.
     SerDer_area = 0.36/8 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
     ctrl_gates   = 129267;
     SerDer_gates = 200000/8;
     NMOS_sizing  = g_tp.min_w_nmos_;
     PMOS_sizing  = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;

     //Power
     //Cadence ChipEstimate using 65nm the controller 125mW for every 200MB/s This is power not energy!
     ctrl_dyn   = 0.125*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
     //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
     //max  Per controller speed is 1.6Gb/s (200MB/s)
     SerDer_dyn = 0.01*1.6*(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;

     double number_channel = 1+(fcp.num_channels-1)*0.2;
     area.set_area((ctrl_area + (fcp.withPHY? SerDer_area:0))*1e6*number_channel);
     power_t.readOp.dynamic = (ctrl_dyn + (fcp.withPHY? SerDer_dyn:0))*number_channel;
     power_t.readOp.leakage = ((ctrl_gates + (fcp.withPHY? SerDer_gates:0))*number_channel)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
     double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
     power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
     power_t.readOp.gate_leakage = ((ctrl_gates + (fcp.withPHY? SerDer_gates:0))*number_channel)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
 }
 /*
  * Derive the reported flash controller power from the baseline in power_t.
  *   is_tdp == true  : fills `power`,    dynamic part scaled by fcp.duty_cycle
  *   is_tdp == false : fills `rt_power`, dynamic part scaled by fcp.perc_load
  */
 void FlashController::computeEnergy(bool is_tdp)
 {
     if (!is_tdp)
     {
         // Runtime figures: weight the dynamic part by the observed load.
         rt_power = power_t;
         rt_power.readOp.dynamic *= fcp.perc_load;
         return;
     }

     // Peak (TDP) figures: weight the dynamic part by the duty cycle.
     power = power_t;
     power.readOp.dynamic *= fcp.duty_cycle;
 }
 void FlashController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
 {
        string indent_str(indent, ' ');
        string indent_str_next(indent+2, ' ');
        bool long_channel = XML->sys.longer_channel_device;
        if (is_tdp)
        {
                cout << "Flash Controller:" << endl;
                cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
                cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic << " W" << endl;//no multiply of clock since this is power already
                cout << indent_str<< "Subthreshold Leakage = "
                        << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
                //cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
                cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
                cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic << " W" << endl;
                cout<<endl;
        }
        else
        {
        }
 }
 /*
  * Copy the flash controller settings from the parsed XML (XML->sys.flashc)
  * into fcp. The channel count is not read from the XML; it is derived from
  * the peak transfer rate, one channel per 200 units of that rate, rounded up.
  */
 void FlashController::set_fc_param()
 {
     // Plain copies, no conversion.
     fcp.type       = XML->sys.flashc.type;
     fcp.withPHY    = XML->sys.flashc.withPHY;
     fcp.num_mcs    = XML->sys.flashc.number_mcs;
     fcp.perc_load  = XML->sys.flashc.total_load_perc;
     fcp.duty_cycle = XML->sys.flashc.duty_cycle;

     // num_channels depends on the transfer rate, so the rate is stored first.
     fcp.peakDataTransferRate = XML->sys.flashc.peak_transfer_rate;
     fcp.num_channels         = ceil(fcp.peakDataTransferRate/200);
 }
--- a/ext/mcpat/iocontrollers.h
+++ b/ext/mcpat/iocontrollers.h
@ -0,0 +1,87 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef IOCONTROLLERS_H_
 #define IOCONTROLLERS_H_

 // The include guard must enclose the whole header. It used to be closed
 // right after the #define, so a second inclusion redefined every class below.
 #include "XML_Parse.h"
 #include "parameter.h"
 //#include "io.h"
 #include "array.h"
 //#include "Undifferentiated_Core_Area.h"
 #include <vector>
 #include "basic_components.h"

 /*
  * Network interface unit (NIU) model.
  * The constructor reads its settings through set_niu_param() and fills in
  * area and the baseline power_t; computeEnergy() derives the TDP or runtime
  * figures from power_t; displayEnergy() prints them.
  */
 class NIUController : public Component {
  public:
     ParseXML *XML;               // parsed configuration (pointer stored, not copied)
     InputParameter interface_ip; // copy of the parameters passed to the constructor
     NIUParam  niup;              // settings filled in by set_niu_param()
     powerDef power_t;            // baseline power figures set by the constructor
     uca_org_t local_result;
     NIUController(ParseXML *XML_interface,InputParameter* interface_ip_);
     void set_niu_param();
     void computeEnergy(bool is_tdp=true);
     void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
     ~NIUController() {}
 };

 /*
  * PCIe controller model.
  * The constructor reads its settings through set_pcie_param() and fills in
  * area and the baseline power_t; computeEnergy() derives the TDP or runtime
  * figures from power_t; displayEnergy() prints them.
  */
 class PCIeController : public Component {
  public:
     ParseXML *XML;               // parsed configuration (pointer stored, not copied)
     InputParameter interface_ip; // copy of the parameters passed to the constructor
     PCIeParam  pciep;            // settings filled in by set_pcie_param()
     powerDef power_t;            // baseline power figures set by the constructor
     uca_org_t local_result;
     PCIeController(ParseXML *XML_interface,InputParameter* interface_ip_);
     void set_pcie_param();
     void computeEnergy(bool is_tdp=true);
     void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
     ~PCIeController() {}
 };

 /*
  * Flash controller model.
  * The constructor reads its settings through set_fc_param() and fills in
  * area and the baseline power_t; computeEnergy() derives the TDP or runtime
  * figures from power_t; displayEnergy() prints them. The settings are kept
  * in an MCParam.
  */
 class FlashController : public Component {
  public:
     ParseXML *XML;               // parsed configuration (pointer stored, not copied)
     InputParameter interface_ip; // copy of the parameters passed to the constructor
     MCParam  fcp;                // settings filled in by set_fc_param()
     powerDef power_t;            // baseline power figures set by the constructor
     uca_org_t local_result;
     FlashController(ParseXML *XML_interface,InputParameter* interface_ip_);
     void set_fc_param();
     void computeEnergy(bool is_tdp=true);
     void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
     ~FlashController() {}
 };

 #endif /* IOCONTROLLERS_H_ */
--- a/ext/mcpat/logic.cc
+++ b/ext/mcpat/logic.cc
--- a/ext/mcpat/logic.h
+++ b/ext/mcpat/logic.h
@ -0,0 +1,233 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef LOGIC_H_
 #define LOGIC_H_
 #include <cassert>
 #include <cmath>
 #include <cstring>
 #include <iostream>
 #include "XML_Parse.h"
 #include "arch_const.h"
 #include "basic_circuit.h"
 #include "basic_components.h"
 #include "cacti_interface.h"
 #include "component.h"
 #include "const.h"
 #include "decoder.h"
 #include "parameter.h"
 #include "xmlParser.h"
 using namespace std;
 // Component for the selection logic. Only the declaration is visible here;
 // the constructor and selection_power() are defined elsewhere.
 // NOTE(review): presumably models instruction-issue selection sized by
 // win_entries and issue_width -- confirm against logic.cc.
 class selection_logic : public Component{
 public:
        // device_ty_ defaults to Core_device and core_ty_ to Inorder.
        // The trailing ParseXML parameter is commented out of the signature,
        // although an XML_interface member is still declared below.
        selection_logic(bool _is_default, int    win_entries_,
                            int  issue_width_, const InputParameter *configure_interface,
                            enum Device_ty device_ty_=Core_device,
                            enum Core_type core_ty_=Inorder);//, const ParseXML *_XML_interface);
        bool is_default;
        InputParameter l_ip;
        uca_org_t local_result;
        const ParseXML *XML_interface;
        int win_entries;
        int issue_width;
        int num_threads;
        enum Device_ty device_ty;
        enum Core_type core_ty;
        void selection_power();
        void leakage_feedback(double temperature); // TODO
 };
 // Component for dependency / resource conflict checking. Only the
 // declaration is visible here; the power computation lives in
 // conflict_check_power() and compare_cap(), defined elsewhere.
 class dep_resource_conflict_check : public Component{
 public:
        // _is_default defaults to true.
        dep_resource_conflict_check(const InputParameter *configure_interface, const CoreDynParam & dyn_p_, int compare_bits_, bool _is_default=true);
        InputParameter l_ip;
        uca_org_t local_result;
        // NOTE(review): the W* members look like transistor widths for the
        // NOR, evaluation-inverter and comparator stages -- confirm in logic.cc.
        double WNORn, WNORp, Wevalinvp, Wevalinvn, Wcompn, Wcompp, Wcomppreequ;
        CoreDynParam  coredynp;
        int compare_bits;
        bool is_default;
        statsDef       tdp_stats;
        statsDef       rtp_stats;
        statsDef       stats_t;
        powerDef       power_t;
        void conflict_check_power();
        double compare_cap();
        // Calls cleanup() on local_result when the object is destroyed.
        ~dep_resource_conflict_check(){
                local_result.cleanup();
        }
        void leakage_feedback(double temperature);
 };
 // Component for the instruction decoder. Only the declaration is visible
 // here; constructors, destructor and inst_decoder_delay_power() are defined
 // elsewhere.
 class inst_decoder: public Component{
 public:
        // device_ty_ defaults to Core_device and core_ty_ to Inorder.
        inst_decoder(bool _is_default, const InputParameter *configure_interface,
                        int opcode_length_,
                        int num_decoders_,
                        bool x86_,
                        enum Device_ty device_ty_=Core_device,
                        enum Core_type core_ty_=Inorder);
        // Default constructor is declared as well.
        inst_decoder();
        bool is_default;
        int  opcode_length;
        int  num_decoders;
        bool x86;
        int  num_decoder_segments;
        int  num_decoded_signals;
        InputParameter l_ip;
        uca_org_t local_result;
        enum Device_ty device_ty;
        enum Core_type core_ty;
        // Raw pointers to the decoder stages.
        // NOTE(review): ownership is not visible from this header; presumably
        // released by ~inst_decoder() -- confirm in logic.cc.
        Decoder * final_dec;
        Predec *  pre_dec;
        statsDef       tdp_stats;
        statsDef       rtp_stats;
        statsDef       stats_t;
        powerDef       power_t;
        void inst_decoder_delay_power();
        ~inst_decoder();
        void leakage_feedback(double temperature);
 };
 // Component for a single D flip-flop cell. Only the declaration is visible
 // here; compute_DFF_cell() and fpfp_node_cap() are defined elsewhere.
 class DFFCell : public Component {
 public:
        DFFCell(bool _is_dram, double _WdecNANDn, double _WdecNANDp,double _cell_load,
                          const InputParameter *configure_interface);
        InputParameter l_ip;
        bool is_dram;
        double cell_load;
        double WdecNANDn;
        double WdecNANDp;
        double clock_cap;
        int    model;
        // NOTE(review): the n_* counters and e_* power figures appear to be
        // paired per event kind (switch / keep 1 / keep 0 / clock) -- confirm.
        int    n_switch;
        int    n_keep_1;
        int    n_keep_0;
        int    n_clock;
        powerDef e_switch;
        powerDef e_keep_1;
        powerDef e_keep_0;
        powerDef e_clock;
        double fpfp_node_cap(unsigned int fan_in, unsigned int fan_out);
        void compute_DFF_cell(void);
        };
 // Component for the pipeline registers. Only the declaration is visible
 // here; compute_stage_vector() and compute() are defined elsewhere.
 class Pipeline : public Component{
 public:
        // device_ty_ defaults to Core_device; _is_core_pipeline and
        // _is_default both default to true.
        Pipeline(const InputParameter *configure_interface, const CoreDynParam & dyn_p_, enum Device_ty device_ty_=Core_device, bool _is_core_pipeline=true, bool _is_default=true);
        InputParameter l_ip;
        uca_org_t local_result;
        CoreDynParam  coredynp;
        enum Device_ty device_ty;
        bool is_core_pipeline, is_default;
        double num_piperegs;
 //	int pipeline_stages;
 //	int tot_stage_vector, per_stage_vector;
        bool process_ind;
        double WNANDn ;
        double WNANDp;
        double load_per_pipeline_stage;
 //	int  Hthread,  num_thread, fetchWidth, decodeWidth, issueWidth, commitWidth, instruction_length;
 //	int  PC_width, opcode_length, num_arch_reg_tag, data_width,num_phsical_reg_tag, address_width;
 //	bool thread_clock_gated;
 //	bool in_order, multithreaded;
        void compute_stage_vector();
        void compute();
        // Calls cleanup() on local_result when the object is destroyed.
        ~Pipeline(){
                local_result.cleanup();
        };
 };
 //class core_pipeline :public pipeline{
 //public:
 //	int  Hthread,  num_thread, fetchWidth, decodeWidth, issueWidth, commitWidth, instruction_length;
 //	int  PC_width, opcode_length, num_arch_reg_tag, data_width,num_phsical_reg_tag, address_width;
 //	bool thread_clock_gated;
 //	bool in_order, multithreaded;
 //	core_pipeline(bool _is_default, const InputParameter *configure_interface);
 //	virtual void compute_stage_vector();
 //
 //};
 // Component for a functional unit of one core. The kind of unit is given by
 // fu_type. Only the declaration is visible here; all member functions are
 // defined elsewhere.
 class FunctionalUnit :public Component{
 public:
        ParseXML *XML;
        // NOTE(review): presumably the index of the owning core -- confirm.
        int  ithCore;
        InputParameter interface_ip;
        CoreDynParam  coredynp;
        double FU_height;
        double clockRate,executionTime;
        double num_fu;
        double energy, base_energy,per_access_energy, leakage, gate_leakage;
        bool  is_default;
        enum FU_type fu_type;
        statsDef       tdp_stats;
        statsDef       rtp_stats;
        statsDef       stats_t;
        powerDef       power_t;
        FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, enum FU_type fu_type);
        // is_tdp defaults to true for both computeEnergy and displayEnergy.
    void computeEnergy(bool is_tdp=true);
        void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
    void leakage_feedback(double temperature);
 };
 // Component named for the undifferentiated part of a core. Only the
 // declaration is visible here; the constructor and displayEnergy() are
 // defined elsewhere.
 class UndiffCore :public Component{
 public:
        // exist_ defaults to true and embedded_ to false.
        UndiffCore(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_=true, bool embedded_=false);
        ParseXML *XML;
        // NOTE(review): presumably the index of the owning core -- confirm.
        int  ithCore;
        InputParameter interface_ip;
        CoreDynParam  coredynp;
        double clockRate,executionTime;
        double scktRatio, chip_PR_overhead, macro_PR_overhead;
        enum  Core_type core_ty;
        bool   opt_performance, embedded;
        double pipeline_stage,num_hthreads,issue_width;
        bool   is_default;
    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
        ~UndiffCore(){};
        bool exist;
 };
 #endif /* LOGIC_H_ */
--- a/ext/mcpat/main.cc
+++ b/ext/mcpat/main.cc
@ -0,0 +1,101 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include <iostream>
 #include "XML_Parse.h"
 #include "globalvar.h"
 #include "io.h"
 #include "processor.h"
 #include "version.h"
 #include "xmlParser.h"
 using namespace std;
 void print_usage(char * argv0);
 int main(int argc,char *argv[])
 {
        char * fb ;
        bool infile_specified     = false;
        int  plevel               = 2;
        opt_for_clk	=true;
        //cout.precision(10);
        if (argc <= 1 || argv[1] == string("-h") || argv[1] == string("--help"))
        {
                print_usage(argv[0]);
        }
        for (int32_t i = 0; i < argc; i++)
        {
                if (argv[i] == string("-infile"))
                {
                        infile_specified = true;
                        i++;
                        fb = argv[ i];
                }
                if (argv[i] == string("-print_level"))
                {
                        i++;
                        plevel = atoi(argv[i]);
                }
                if (argv[i] == string("-opt_for_clk"))
                {
                        i++;
                        opt_for_clk = (bool)atoi(argv[i]);
                }
        }
        if (infile_specified == false)
        {
                print_usage(argv[0]);
        }
        cout<<"McPAT (version "<< VER_MAJOR <<"."<< VER_MINOR
                << " of " << VER_UPDATE << ") is computing the target processor...\n "<<endl;
        //parse XML-based interface
        ParseXML *p1= new ParseXML();
        p1->parse(fb);
        Processor proc(p1);
        proc.displayEnergy(2, plevel);
        delete p1;
        return 0;
 }
 /*
  * Write the command-line help to stderr and terminate the process with
  * exit status 1. This function never returns. argv0 is not used.
  */
 void print_usage(char * argv0)
 {
     cerr << "How to use McPAT:" << endl
          << "  mcpat -infile <input file name>  -print_level < level of details 0~5 >  -opt_for_clk < 0 (optimize for ED^2P only)/1 (optimzed for target clock rate)>"
          << endl;
     exit(1);
 }
--- a/ext/mcpat/makefile
+++ b/ext/mcpat/makefile
@ -0,0 +1,28 @@
 # Top-level driver: selects a build flavour (dbg or opt) and delegates the
 # actual compilation to $(TAR).mk with TAG set to that flavour.
 #   make / make opt   optimised build into obj_opt/
 #   make dbg          debug build into obj_dbg/
 #   make clean        remove both flavours
 TAR = mcpat

 # None of these names are files; "all" was missing from the list.
 .PHONY: all dbg opt depend clean clean_dbg clean_opt

 all: opt

 dbg: $(TAR).mk obj_dbg
 	@$(MAKE) TAG=dbg -C . -f $(TAR).mk

 opt: $(TAR).mk obj_opt
 	@$(MAKE) TAG=opt -C . -f $(TAR).mk

 # -p keeps the rule harmless if the directory already exists.
 obj_dbg:
 	mkdir -p $@

 obj_opt:
 	mkdir -p $@

 clean: clean_dbg clean_opt

 # Cleaning no longer depends on the object directory, so "make clean" does
 # not create a directory only to delete it again.
 clean_dbg:
 	@$(MAKE) TAG=dbg -C . -f $(TAR).mk clean
 	rm -rf obj_dbg

 clean_opt:
 	@$(MAKE) TAG=opt -C . -f $(TAR).mk clean
 	rm -rf obj_opt
--- a/ext/mcpat/mcpat.mk
+++ b/ext/mcpat/mcpat.mk
@ -0,0 +1,81 @@
 # Build rules for the mcpat binary. Normally invoked by the top-level
 # makefile with TAG=dbg or TAG=opt; objects and the linked binary go to
 # obj_$(TAG)/ and the binary is then copied to ./$(TARGET).
 TARGET = mcpat
 SHELL = /bin/sh
 .PHONY: all depend clean
 .SUFFIXES: .cc .o

 # Value passed to the sources as -DNTHREADS in optimised builds.
 ifndef NTHREADS
  NTHREADS = 4
 endif

 LIBS =
 # Despite the name this holds a linker flag (libm), used on the link line only.
 INCS = -lm

 ifeq ($(TAG),dbg)
  DBG = -Wall
  OPT = -ggdb -g -O0 -DNTHREADS=1 -Icacti
 else
  DBG =
  OPT = -O3 -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS) -Icacti
  #OPT = -O0 -DNTHREADS=$(NTHREADS)
 endif

 #CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT)
 CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT)
 # NOTE: -m32 forces a 32-bit build and needs a 32-bit capable toolchain.
 CXX = g++ -m32
 CC  = gcc -m32

 # Sources not found in this directory are looked up in cacti/.
 VPATH = cacti

 SRCS  = \
  Ucache.cc \
  XML_Parse.cc \
  arbiter.cc \
  area.cc \
  array.cc \
  bank.cc \
  basic_circuit.cc \
  basic_components.cc \
  cacti_interface.cc \
  component.cc \
  core.cc \
  crossbar.cc \
  decoder.cc \
  htree2.cc \
  interconnect.cc \
  io.cc \
  iocontrollers.cc \
  logic.cc \
  main.cc \
  mat.cc \
  memoryctrl.cc \
  noc.cc \
  nuca.cc \
  parameter.cc \
  processor.cc \
  router.cc \
  sharedcache.cc \
  subarray.cc \
  technology.cc \
  uca.cc \
  wire.cc \
  xmlParser.cc

 OBJS = $(patsubst %.cc,obj_$(TAG)/%.o,$(SRCS))

 all: obj_$(TAG)/$(TARGET)
 	cp -f obj_$(TAG)/$(TARGET) $(TARGET)

 obj_$(TAG)/$(TARGET) : $(OBJS)
 	$(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread

 #obj_$(TAG)/%.o : %.cc
 #	$(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $<

 obj_$(TAG)/%.o : %.cc
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 # Objects and the linked binary live in obj_$(TAG)/, not in the current
 # directory, so they are removed there as well; the old "*.o" pattern is
 # kept for stray objects.
 clean:
 	-rm -f *.o obj_$(TAG)/*.o obj_$(TAG)/$(TARGET) $(TARGET)
--- a/ext/mcpat/mcpatXeonCore.mk
+++ b/ext/mcpat/mcpatXeonCore.mk
@ -0,0 +1,81 @@
 # Build rules for the mcpatXeonCore binary. Same layout as the mcpat rules,
 # except that technology_xeon_core.cc is compiled in place of technology.cc.
 # Objects and the linked binary are written to obj_$(TAG)/ and the binary is
 # then copied to ./$(TARGET).
 TARGET = mcpatXeonCore
 SHELL = /bin/sh
 .PHONY: all depend clean
 .SUFFIXES: .cc .o
 # Value passed as -DNTHREADS to optimized builds unless already set.
 ifndef NTHREADS
  NTHREADS = 4
 endif
 LIBS = 
 # Despite its name, INCS only carries the math library link flag.
 INCS = -lm
 # Debug builds enable warnings, disable optimization and force NTHREADS=1.
 ifeq ($(TAG),dbg)
  DBG = -Wall 
  OPT = -ggdb -g -O0 -DNTHREADS=1 -Icacti
 else
  DBG = 
  OPT = -O3 -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS) -Icacti
  #OPT = -O0 -DNTHREADS=$(NTHREADS)
 endif
 #CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT) 
 CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT) 
 # Both compilers are forced to 32-bit code generation.
 CXX = g++ -m32
 CC  = gcc -m32
 # Sources are looked up in the current directory and in cacti/.
 VPATH = cacti
 SRCS  = \
  Ucache.cc \
  XML_Parse.cc \
  arbiter.cc \
  area.cc \
  array.cc \
  bank.cc \
  basic_circuit.cc \
  basic_components.cc \
  cacti_interface.cc \
  component.cc \
  core.cc \
  crossbar.cc \
  decoder.cc \
  htree2.cc \
  interconnect.cc \
  io.cc \
  iocontrollers.cc \
  logic.cc \
  main.cc \
  mat.cc \
  memoryctrl.cc \
  noc.cc \
  nuca.cc \
  parameter.cc \
  processor.cc \
  router.cc \
  sharedcache.cc \
  subarray.cc \
  technology_xeon_core.cc \
  uca.cc \
  wire.cc \
  xmlParser.cc 
 # One object per source, placed in the per-flavour object directory.
 OBJS = $(patsubst %.cc,obj_$(TAG)/%.o,$(SRCS))
 all: obj_$(TAG)/$(TARGET)
 	cp -f obj_$(TAG)/$(TARGET) $(TARGET)
 obj_$(TAG)/$(TARGET) : $(OBJS)
 	$(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread
 #obj_$(TAG)/%.o : %.cc
 #	$(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $<
 obj_$(TAG)/%.o : %.cc
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 # Objects and the linked binary live in obj_$(TAG)/, so they are removed
 # there; the historical *.o pattern for the current directory is kept too.
 clean:
 	-rm -f *.o obj_$(TAG)/*.o obj_$(TAG)/$(TARGET) $(TARGET)
--- a/ext/mcpat/memoryctrl.cc
+++ b/ext/mcpat/memoryctrl.cc
@ -0,0 +1,736 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include <algorithm>
 #include <cassert>
 #include <cmath>
 #include <iostream>
 #include <string>
 #include "XML_Parse.h"
 #include "basic_circuit.h"
 #include "basic_components.h"
 #include "const.h"
 #include "io.h"
 #include "logic.h"
 #include "memoryctrl.h"
 #include "parameter.h"
 /* Overview of MC models:
 * McPAT memory controllers are modeled according to a large number of industrial data points.
 * The basic memory controller architecture is based on the Synopsys designs
 * (DesignWare DDR2/DDR3-Lite memory controllers and DDR2/DDR3-Lite protocol controllers)
 * as in Cadence ChipEstimator Tool.
 *
 * An MC has 3 parts as shown in this design. McPAT models both high performance MC
 * based on Niagara processor designs and curve fitting, and low power MC based on data points in
 * Cadence ChipEstimator Tool.
 *
 * The frontend is modeled analytically, the backend is modeled empirically according to
 * DDR2/DDR3-Lite protocol controllers in Cadence ChipEstimator Tool.
 * The PHY is modeled based on
 * "A 100mW 9.6Gb/s Transceiver in 90nm CMOS for next-generation memory interfaces," ISSCC 2006,
 * and "A 14mW 6.25Gb/s Transceiver in 90nm CMOS for Serial Chip-to-Chip Communication," ISSCC 2007.
 *
 * In Cadence ChipEstimator Tool there are two types of memory controllers: the full memory controllers
 * that include the frontend, such as the DesignWare DDR2/DDR3-Lite memory controllers, and the backend-only
 * memory controllers, such as the DDR2/DDR3-Lite protocol controllers (except for the DesignWare DDR2/DDR3-Lite
 * memory controllers, all memory controller IP in Cadence ChipEstimator Tool are backend memory controllers such as
 * DDRC 1600A and DDRC 800A). Thus, to some extent the area and power difference between DesignWare
 * DDR2/DDR3-Lite memory controllers and DDR2/DDR3-Lite protocol controllers can be an estimate of the
 * frontend power and area, which is very close to the analytically modeled results of the frontend for Niagara2@65nm.
 *
 */
 /*
  * Memory controller backend (transaction engine) model.
  * Keeps private copies of the interface parameters, the controller type and
  * the MC parameters, runs init_interface() on the local parameter copy and
  * then evaluates the model once through compute().
  */
 MCBackend::MCBackend(InputParameter* interface_ip_,
                      const MCParam & mcp_,
                      enum MemoryCtrl_type mc_type_)
     : l_ip(*interface_ip_),
       mc_type(mc_type_),
       mcp(mcp_)
 {
     local_result = init_interface(&l_ip);
     compute();
 }
 void MCBackend::compute()
 {
  //double max_row_addr_width = 20.0;//Current address 12~18bits
  double C_MCB, mc_power, backend_dyn, backend_gates;//, refresh_period,refresh_freq;//Equivalent per bit Cap for backend,
  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
  double NMOS_sizing, PMOS_sizing;
  if (mc_type == MC)
  {
          if (mcp.type == 0)
          {
                  //area = (2.2927*log(peakDataTransferRate)-14.504)*memDataWidth/144.0*(l_ip.F_sz_um/0.09);
                  area.set_area((2.7927*log(mcp.peakDataTransferRate*2)-19.862)/2.0*mcp.dataBusWidth/128.0*(l_ip.F_sz_um/0.09)*mcp.num_channels*1e6);//um^2
                  //assuming the approximately same scaling factor as seen in processors.
                  //C_MCB=0.2/1.3/1.3/266/64/0.09*g_ip.F_sz_um;//based on AMD Geode processor which has a very basic mc on chip.
                  //C_MCB = 1.6/200/1e6/144/1.2/1.2*g_ip.F_sz_um/0.19;//Based on Niagara power numbers.The base power (W) is divided by device frequency and vdd and scale to target process.
                  //mc_power = 0.0291*2;//29.1mW@200MHz @130nm From Power Analysis of SystemLevel OnChip Communication Architectures by Lahiri et
                  mc_power = 4.32*0.1;//4.32W@1GhzMHz @65nm Cadence ChipEstimator 10% for backend
                  C_MCB = mc_power/1e9/72/1.1/1.1*l_ip.F_sz_um/0.065;
                  power_t.readOp.dynamic = C_MCB*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(mcp.dataBusWidth/*+mcp.addressBusWidth*/);//per access energy in memory controller
                  power_t.readOp.leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
                  power_t.readOp.gate_leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
          }
          else
          {   NMOS_sizing 	  = g_tp.min_w_nmos_;
                  PMOS_sizing	  = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
                  area.set_area(0.15*mcp.dataBusWidth/72.0*(l_ip.F_sz_um/0.065)* (l_ip.F_sz_um/0.065)*mcp.num_channels*1e6);//um^2
                  backend_dyn = 0.9e-9/800e6*mcp.clockRate/12800*mcp.peakDataTransferRate*mcp.dataBusWidth/72.0*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(l_ip.F_sz_nm/65.0);//Average on DDR2/3 protocol controller and DDRC 1600/800A in Cadence ChipEstimate
                  //Scaling to technology and DIMM feature. The base IP support DDR3-1600(PC3 12800)
                  backend_gates = 50000*mcp.dataBusWidth/64.0;//5000 is from Cadence ChipEstimator
                  power_t.readOp.dynamic = backend_dyn;
                  power_t.readOp.leakage = (backend_gates)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
                  power_t.readOp.gate_leakage = (backend_gates)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
          }
  }
  else
  {//skip old model
          cout<<"Unknown memory controllers"<<endl;exit(0);
          area.set_area(0.243*mcp.dataBusWidth/8);//area based on Cadence ChipEstimator for 8bit bus
          //mc_power = 4.32*0.1;//4.32W@1GhzMHz @65nm Cadence ChipEstimator 10% for backend
          C_MCB = mc_power/1e9/72/1.1/1.1*l_ip.F_sz_um/0.065;
          power_t.readOp.leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
          power_t.readOp.gate_leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
          power_t.readOp.dynamic *= 1.2;
          power_t.readOp.leakage *= 1.2;
          power_t.readOp.gate_leakage *= 1.2;
          //flash controller has about 20% more backend power since BCH ECC in flash is complex and power hungry
  }
  double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
  power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
 }
 /*
  * Derives backend power from the per-access numbers held in power_t.
  *   is_tdp == true  : uses a fixed 50/50 read/write split per channel and
  *                     stores the result in power.
  *   is_tdp == false : uses the read/write counts from mcp and accumulates
  *                     into rt_power, adding leakage and a 10%-of-peak term
  *                     for routine work (refresh, scrubbing).
  * The runtime path reads power.readOp.dynamic, so it relies on the value
  * left there by an earlier call with is_tdp == true.
  */
 void MCBackend::computeEnergy(bool is_tdp)
 {
     //backend uses internal data buswidth
     if (!is_tdp)
     {
         //runtime power (RTP) statistics
         stats_t.readAc.access  = mcp.reads;
         stats_t.writeAc.access = mcp.writes;
         //NOTE(review): the runtime statistics are stored in tdp_stats here as
         //well; confirm whether a separate runtime member was intended.
         tdp_stats = stats_t;
         rt_power.readOp.dynamic = (stats_t.readAc.access + stats_t.writeAc.access)*mcp.llcBlockSize*8.0/mcp.dataBusWidth*power_t.readOp.dynamic;
         rt_power = rt_power + power_t*pppm_lkg;
         //Assume 10% of peak power is consumed by routine job including memory refreshing and scrubbing
         rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime;
         return;
     }
     //peak statistics
     stats_t.readAc.access  = 0.5*mcp.num_channels;
     stats_t.writeAc.access = 0.5*mcp.num_channels;
     tdp_stats = stats_t;
     power = power_t;
     power.readOp.dynamic = (stats_t.readAc.access + stats_t.writeAc.access)*power_t.readOp.dynamic;
 }
 /*
  * Memory controller PHY model.
  * Keeps private copies of the interface parameters, the controller type and
  * the MC parameters, runs init_interface() on the local parameter copy and
  * then evaluates the model once through compute().
  */
 MCPHY::MCPHY(InputParameter* interface_ip_,
              const MCParam & mcp_,
              enum MemoryCtrl_type mc_type_)
     : l_ip(*interface_ip_),
       mc_type(mc_type_),
       mcp(mcp_)
 {
     local_result = init_interface(&l_ip);
     compute();
 }
 void MCPHY::compute()
 {
  //PHY uses internal data buswidth but the actuall off-chip datawidth is 64bits + ecc
  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio() ;
  /*
   * according to "A 100mW 9.6Gb/s Transceiver in 90nm CMOS for next-generation memory interfaces ," ISSCC 2006;
   * From Cadence ChipEstimator for normal I/O around 0.4~0.8 mW/Gb/s
   */
  double power_per_gb_per_s, phy_dyn,phy_gates, NMOS_sizing, PMOS_sizing;
  if (mc_type == MC)
  {
          if (mcp.type == 0)
          {
                  power_per_gb_per_s = mcp.LVDS? 0.01:0.04;
                  //Based on die photos from Niagara 1 and 2.
                  //TODO merge this into undifferentiated core.PHY only achieves square root of the ideal scaling.
                  //area = (6.4323*log(peakDataTransferRate)-34.76)*memDataWidth/128.0*(l_ip.F_sz_um/0.09);
                  area.set_area((6.4323*log(mcp.peakDataTransferRate*2)-48.134)*mcp.dataBusWidth/128.0*(l_ip.F_sz_um/0.09)*mcp.num_channels*1e6/2);//TODO:/2
                  //This is from curve fitting based on Niagara 1 and 2's PHY die photo.
                  //This is power not energy, 10mw/Gb/s @90nm for each channel and scaling down
                  //power.readOp.dynamic = 0.02*memAccesses*llcBlocksize*8;//change from Bytes to bits.
                  power_t.readOp.dynamic = power_per_gb_per_s*sqrt(l_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
                  power_t.readOp.leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
                  power_t.readOp.gate_leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
          }
          else
          {
                  NMOS_sizing 	  = g_tp.min_w_nmos_;
                  PMOS_sizing	  = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
                  //Designware/synopsis 16bit DDR3 PHY is 1.3mm (WITH IOs) at 40nm for upto DDR3 2133 (PC3 17066)
                  double non_IO_percentage = 0.2;
                  area.set_area(1.3*non_IO_percentage/2133.0e6*mcp.clockRate/17066*mcp.peakDataTransferRate*mcp.dataBusWidth/16.0*(l_ip.F_sz_um/0.040)* (l_ip.F_sz_um/0.040)*mcp.num_channels*1e6);//um^2
                  phy_gates = 200000*mcp.dataBusWidth/64.0;
                  power_per_gb_per_s = 0.01;
                  //This is power not energy, 10mw/Gb/s @90nm for each channel and scaling down
                  power_t.readOp.dynamic = power_per_gb_per_s*(l_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
                  power_t.readOp.leakage = (mcp.withPHY? phy_gates:0)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
                  power_t.readOp.gate_leakage = (mcp.withPHY? phy_gates:0)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
          }
  }
  else
  {
          area.set_area(0.4e6/2*mcp.dataBusWidth/8);//area based on Cadence ChipEstimator for 8bit bus
  }
 //  double phy_factor = (int)ceil(mcp.dataBusWidth/72.0);//Previous phy power numbers are based on 72 bit DIMM interface
 //  power_t.readOp.dynamic *= phy_factor;
 //  power_t.readOp.leakage *= phy_factor;
 //  power_t.readOp.gate_leakage *= phy_factor;
  double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
  power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
 }
 /*
  * Derives PHY power from the numbers held in power_t.
  *   is_tdp == true  : scales the per-Gb/s figure by the peak transfer rate,
  *                     bus width and channel count, divided by the clock rate
  *                     because the final computation multiplies by the clock.
  *   is_tdp == false : scales by the transferred bits taken from the read and
  *                     write counts, then adds a 10%-of-peak term.
  * The runtime path reads power.readOp.dynamic, so it relies on the value
  * left there by an earlier call with is_tdp == true.
  */
 void MCPHY::computeEnergy(bool is_tdp)
 {
     if (!is_tdp)
     {
         //runtime power (RTP) statistics
         stats_t.readAc.access  = mcp.reads;
         stats_t.writeAc.access = mcp.writes;
         //NOTE(review): the runtime statistics are stored in tdp_stats here as
         //well; confirm whether a separate runtime member was intended.
         tdp_stats = stats_t;
         rt_power = power_t;
         rt_power.readOp.dynamic=power_t.readOp.dynamic*(stats_t.readAc.access + stats_t.writeAc.access)*(mcp.llcBlockSize)*8/1e9/mcp.executionTime*(mcp.executionTime);
         rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime;
         return;
     }
     //peak statistics; reads and writes time share the buses
     stats_t.readAc.access  = 0.5*mcp.num_channels;
     stats_t.writeAc.access = 0.5*mcp.num_channels;
     tdp_stats = stats_t;
     double data_transfer_unit = (mc_type == MC)? 72:16;/*DIMM data width*/
     power = power_t;
     // divide by clock rate is for match the final computation where *clock is used
     power.readOp.dynamic = power.readOp.dynamic * (mcp.peakDataTransferRate*8*1e6/1e9/*change to Gbs*/)*mcp.dataBusWidth/data_transfer_unit*mcp.num_channels/mcp.clockRate;
 }
 /*
  * Memory controller front end: a request reorder buffer, its selection and
  * arbitration logic, a read buffer and a write buffer.
  *
  * All computations are for a single MC. Each array is configured by
  * rewriting fields of the shared interface_ip copy and then constructing an
  * ArrayST from it, so the order of the three sections matters: fields that a
  * later section does not assign keep the value given by an earlier one.
  * Every array's area is increased by local_result.area times the number of
  * memory channels, and the same amount is added to this component's area.
  */
 MCFrontEnd::MCFrontEnd(ParseXML *XML_interface,InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_)
     : XML(XML_interface),
       interface_ip(*interface_ip_),
       mc_type(mc_type_),
       mcp(mcp_),
       MC_arb(0),
       frontendBuffer(0),
       readBuffer(0),
       writeBuffer(0)
 {
     bool use_default = true;//indication for default setup
     /* MC frontend engine channels share the same engines but logically partitioned
      * For all hardware inside MC. different channels do not share resources.
      * TODO: add decoding/mux stage to steer memory requests to different channels.
      */
     // ---- memory request reorder buffer ----
     int tag_bits   = mcp.addressBusWidth  + EXTRA_TAG_BITS + mcp.opcodeW;
     int line_bytes = int(ceil((XML->sys.physical_address_width + mcp.opcodeW)/8.0));
     interface_ip.cache_sz = line_bytes*XML->sys.mc.req_window_size_per_channel;
     interface_ip.line_sz = line_bytes;
     interface_ip.assoc = 0;
     interface_ip.nbanks = 1;
     interface_ip.out_w = interface_ip.line_sz*8;
     interface_ip.specific_tag = 1;
     interface_ip.tag_w = tag_bits;
     interface_ip.access_mode = 0;
     interface_ip.throughput = 1.0/mcp.clockRate;
     interface_ip.latency = 1.0/mcp.clockRate;
     interface_ip.is_cache = true;
     interface_ip.pure_cam = false;
     interface_ip.pure_ram = false;
     interface_ip.obj_func_dyn_energy = 0;
     interface_ip.obj_func_dyn_power = 0;
     interface_ip.obj_func_leak_power = 0;
     interface_ip.obj_func_cycle_t = 1;
     interface_ip.num_rw_ports = 0;
     interface_ip.num_rd_ports = XML->sys.mc.memory_channels_per_mc;
     interface_ip.num_wr_ports = interface_ip.num_rd_ports;
     interface_ip.num_se_rd_ports = 0;
     interface_ip.num_search_ports = XML->sys.mc.memory_channels_per_mc;
     frontendBuffer = new ArrayST(&interface_ip, "MC ReorderBuffer", Uncore_device);
     frontendBuffer->area.set_area(frontendBuffer->area.get_area()+ frontendBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
     area.set_area(area.get_area()+ frontendBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
     // ---- selection and arbitration logic ----
     MC_arb = new selection_logic(use_default, XML->sys.mc.req_window_size_per_channel,1,&interface_ip, Uncore_device);
     // ---- read buffer ----
     //Support key words first operation; 8 converts bits to bytes
     line_bytes = (int)ceil(mcp.dataBusWidth/8.0);
     interface_ip.cache_sz = line_bytes*XML->sys.mc.IO_buffer_size_per_channel;//*llcBlockSize;
     interface_ip.line_sz = line_bytes;
     interface_ip.assoc = 1;
     interface_ip.nbanks = 1;
     interface_ip.out_w = interface_ip.line_sz*8;
     interface_ip.access_mode = 1;
     interface_ip.throughput = 1.0/mcp.clockRate;
     interface_ip.latency = 1.0/mcp.clockRate;
     interface_ip.is_cache = false;
     interface_ip.pure_cam = false;
     interface_ip.pure_ram = true;
     interface_ip.obj_func_dyn_energy = 0;
     interface_ip.obj_func_dyn_power = 0;
     interface_ip.obj_func_leak_power = 0;
     interface_ip.obj_func_cycle_t = 1;
     interface_ip.num_rw_ports = 0;//XML->sys.mc.memory_channels_per_mc*2>2?2:XML->sys.mc.memory_channels_per_mc*2;
     interface_ip.num_rd_ports = XML->sys.mc.memory_channels_per_mc;
     interface_ip.num_wr_ports = interface_ip.num_rd_ports;
     interface_ip.num_se_rd_ports = 0;
     readBuffer = new ArrayST(&interface_ip, "MC ReadBuffer", Uncore_device);
     readBuffer->area.set_area(readBuffer->area.get_area()+ readBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
     area.set_area(area.get_area()+ readBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
     // ---- write buffer ----
     //is_cache / pure_cam / pure_ram are not reassigned here and keep the read buffer values
     line_bytes = (int)ceil(mcp.dataBusWidth/8.0);
     interface_ip.cache_sz = line_bytes*XML->sys.mc.IO_buffer_size_per_channel;//*llcBlockSize;
     interface_ip.line_sz = line_bytes;
     interface_ip.assoc = 1;
     interface_ip.nbanks = 1;
     interface_ip.out_w = interface_ip.line_sz*8;
     interface_ip.access_mode = 0;
     interface_ip.throughput = 1.0/mcp.clockRate;
     interface_ip.latency = 1.0/mcp.clockRate;
     interface_ip.obj_func_dyn_energy = 0;
     interface_ip.obj_func_dyn_power = 0;
     interface_ip.obj_func_leak_power = 0;
     interface_ip.obj_func_cycle_t = 1;
     interface_ip.num_rw_ports = 0;
     interface_ip.num_rd_ports = XML->sys.mc.memory_channels_per_mc;
     interface_ip.num_wr_ports = interface_ip.num_rd_ports;
     interface_ip.num_se_rd_ports = 0;
     writeBuffer = new ArrayST(&interface_ip, "MC writeBuffer", Uncore_device);
     writeBuffer->area.set_area(writeBuffer->area.get_area()+ writeBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
     area.set_area(area.get_area()+ writeBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
 }
 /*
  * Computes the front-end power from the three buffer arrays.
  * Step 1 sets the access counts of each buffer (peak counts from port
  * numbers and duty cycle, or runtime counts from the XML memory statistics).
  * Step 2 rebuilds each buffer's power_t dynamic number from those counts and
  * the per-access energies in local_result.
  * Step 3 accumulates the buffers plus their leakage into power (peak) or
  * rt_power (runtime); the runtime path also adds a 10%-of-peak term, which
  * reads the value left in power by an earlier peak call.
  */
 void MCFrontEnd::computeEnergy(bool is_tdp)
 {
     // ---- step 1: access statistics ----
     if (!is_tdp)
     {
         //runtime power (RTP) statistics
         //For each channel, each memory word need to check the address data to achieve best scheduling results.
         //and this need to be done on all physical DIMMs in each logical memory DIMM *mcp.dataBusWidth/72
         frontendBuffer->stats_t.readAc.access = XML->sys.mc.memory_reads *mcp.llcBlockSize*8.0/mcp.dataBusWidth*mcp.dataBusWidth/72;
         frontendBuffer->stats_t.writeAc.access = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth*mcp.dataBusWidth/72;
         frontendBuffer->rtp_stats = frontendBuffer->stats_t;
         //support key word first; both counts of the read buffer come from memory_reads
         readBuffer->stats_t.readAc.access = XML->sys.mc.memory_reads*mcp.llcBlockSize*8.0/mcp.dataBusWidth;
         readBuffer->stats_t.writeAc.access = XML->sys.mc.memory_reads*mcp.llcBlockSize*8.0/mcp.dataBusWidth;
         readBuffer->rtp_stats = readBuffer->stats_t;
         writeBuffer->stats_t.readAc.access = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth;
         writeBuffer->stats_t.writeAc.access = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth;
         writeBuffer->rtp_stats = writeBuffer->stats_t;
     }
     else
     {
         //peak statistics
         frontendBuffer->stats_t.readAc.access = frontendBuffer->l_ip.num_search_ports;
         frontendBuffer->stats_t.writeAc.access = frontendBuffer->l_ip.num_wr_ports;
         frontendBuffer->tdp_stats = frontendBuffer->stats_t;
         readBuffer->stats_t.readAc.access = readBuffer->l_ip.num_rd_ports*mcp.frontend_duty_cycle;
         readBuffer->stats_t.writeAc.access = readBuffer->l_ip.num_wr_ports*mcp.frontend_duty_cycle;
         readBuffer->tdp_stats = readBuffer->stats_t;
         writeBuffer->stats_t.readAc.access = writeBuffer->l_ip.num_rd_ports*mcp.frontend_duty_cycle;
         writeBuffer->stats_t.writeAc.access = writeBuffer->l_ip.num_wr_ports*mcp.frontend_duty_cycle;
         writeBuffer->tdp_stats = writeBuffer->stats_t;
     }
     // ---- step 2: per-buffer dynamic numbers ----
     //reorder buffer: every access searches; reads and writes add their own energy
     frontendBuffer->power_t.reset();
     frontendBuffer->power_t.readOp.dynamic += (frontendBuffer->stats_t.readAc.access +
                       frontendBuffer->stats_t.writeAc.access)*frontendBuffer->local_result.power.searchOp.dynamic
                     + frontendBuffer->stats_t.readAc.access * frontendBuffer->local_result.power.readOp.dynamic
                     + frontendBuffer->stats_t.writeAc.access*frontendBuffer->local_result.power.writeOp.dynamic;
     readBuffer->power_t.reset();
     readBuffer->power_t.readOp.dynamic += (readBuffer->stats_t.readAc.access*
                     readBuffer->local_result.power.readOp.dynamic+
                     readBuffer->stats_t.writeAc.access*readBuffer->local_result.power.writeOp.dynamic);
     writeBuffer->power_t.reset();
     writeBuffer->power_t.readOp.dynamic += (writeBuffer->stats_t.readAc.access*
                     writeBuffer->local_result.power.readOp.dynamic+
                     writeBuffer->stats_t.writeAc.access*writeBuffer->local_result.power.writeOp.dynamic);
     // ---- step 3: accumulate ----
     if (!is_tdp)
     {
         rt_power = rt_power + frontendBuffer->power_t + readBuffer->power_t + writeBuffer->power_t +
                 (frontendBuffer->local_result.power +
                  readBuffer->local_result.power +
                  writeBuffer->local_result.power)*pppm_lkg;
         rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime;
     }
     else
     {
         power = power + frontendBuffer->power_t + readBuffer->power_t + writeBuffer->power_t +
                 (frontendBuffer->local_result.power +
                  readBuffer->local_result.power +
                  writeBuffer->local_result.power)*pppm_lkg;
     }
 }
 void MCFrontEnd::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
 {
        string indent_str(indent, ' ');
        string indent_str_next(indent+2, ' ');
        if (is_tdp)
        {
                cout << indent_str << "Front End ROB:" << endl;
                cout << indent_str_next << "Area = " << frontendBuffer->area.get_area()*1e-6<< " mm^2" << endl;
                cout << indent_str_next << "Peak Dynamic = " << frontendBuffer->power.readOp.dynamic*mcp.clockRate << " W" << endl;
                cout << indent_str_next << "Subthreshold Leakage = " << frontendBuffer->power.readOp.leakage <<" W" << endl;
                cout << indent_str_next << "Gate Leakage = " << frontendBuffer->power.readOp.gate_leakage << " W" << endl;
                cout << indent_str_next << "Runtime Dynamic = " << frontendBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
                cout <<endl;
                cout << indent_str<< "Read Buffer:" << endl;
                cout << indent_str_next << "Area = " << readBuffer->area.get_area()*1e-6  << " mm^2" << endl;
                cout << indent_str_next << "Peak Dynamic = " << readBuffer->power.readOp.dynamic*mcp.clockRate  << " W" << endl;
                cout << indent_str_next << "Subthreshold Leakage = " << readBuffer->power.readOp.leakage  << " W" << endl;
                cout << indent_str_next << "Gate Leakage = " << readBuffer->power.readOp.gate_leakage  << " W" << endl;
                cout << indent_str_next << "Runtime Dynamic = " << readBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
                cout <<endl;
                cout << indent_str << "Write Buffer:" << endl;
                cout << indent_str_next << "Area = " << writeBuffer->area.get_area() *1e-6 << " mm^2" << endl;
                cout << indent_str_next << "Peak Dynamic = " << writeBuffer->power.readOp.dynamic*mcp.clockRate  << " W" << endl;
                cout << indent_str_next << "Subthreshold Leakage = " << writeBuffer->power.readOp.leakage  << " W" << endl;
                cout << indent_str_next << "Gate Leakage = " << writeBuffer->power.readOp.gate_leakage  << " W" << endl;
                cout << indent_str_next << "Runtime Dynamic = " << writeBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
                cout <<endl;
        }
        else
        {
                cout << indent_str << "Front End ROB:" << endl;
                cout << indent_str_next << "Area = " << frontendBuffer->area.get_area()*1e-6<< " mm^2" << endl;
                cout << indent_str_next << "Peak Dynamic = " << frontendBuffer->rt_power.readOp.dynamic*mcp.clockRate << " W" << endl;
                cout << indent_str_next << "Subthreshold Leakage = " << frontendBuffer->rt_power.readOp.leakage <<" W" << endl;
                cout << indent_str_next << "Gate Leakage = " << frontendBuffer->rt_power.readOp.gate_leakage << " W" << endl;
                cout <<endl;
                cout << indent_str<< "Read Buffer:" << endl;
                cout << indent_str_next << "Area = " << readBuffer->area.get_area()*1e-6  << " mm^2" << endl;
                cout << indent_str_next << "Peak Dynamic = " << readBuffer->rt_power.readOp.dynamic*mcp.clockRate  << " W" << endl;
                cout << indent_str_next << "Subthreshold Leakage = " << readBuffer->rt_power.readOp.leakage  << " W" << endl;
                cout << indent_str_next << "Gate Leakage = " << readBuffer->rt_power.readOp.gate_leakage  << " W" << endl;
                cout <<endl;
                cout << indent_str << "Write Buffer:" << endl;
                cout << indent_str_next << "Area = " << writeBuffer->area.get_area() *1e-6 << " mm^2" << endl;
                cout << indent_str_next << "Peak Dynamic = " << writeBuffer->rt_power.readOp.dynamic*mcp.clockRate  << " W" << endl;
                cout << indent_str_next << "Subthreshold Leakage = " << writeBuffer->rt_power.readOp.leakage  << " W" << endl;
                cout << indent_str_next << "Gate Leakage = " << writeBuffer->rt_power.readOp.gate_leakage  << " W" << endl;
        }
 }
 /*
  * Top-level memory controller model. All computations are for a single MC.
  * Sets the wire options on the local interface_ip copy, loads the MC
  * parameters through set_mc_param(), then builds the front end, the
  * transaction engine (backend) and - for mcp.type 0, or mcp.type 1 with
  * mcp.withPHY set - the PHY, adding each sub-component's area to this
  * component's area.
  * pipeLogic is initialized to null and is not constructed here; the earlier
  * pipeline and clock-network modeling attempts are not part of this model.
  * TODO (from the original notes): the transaction engine and PHY need better
  * numbers, e.g. by running the RTL code from OpenSparc.
  */
 MemoryController::MemoryController(ParseXML *XML_interface,InputParameter* interface_ip_, enum MemoryCtrl_type mc_type_)
     : XML(XML_interface),
       interface_ip(*interface_ip_),
       mc_type(mc_type_),
       frontend(0),
       transecEngine(0),
       PHY(0),
       pipeLogic(0)
 {
     interface_ip.wire_is_mat_type = 2;
     interface_ip.wire_os_mat_type = 2;
     interface_ip.wt = Global;
     set_mc_param();
     //front end
     frontend = new MCFrontEnd(XML, &interface_ip, mcp, mc_type);
     area.set_area(area.get_area()+ frontend->area.get_area());
     //transaction engine
     transecEngine = new MCBackend(&interface_ip, mcp, mc_type);
     area.set_area(area.get_area()+ transecEngine->area.get_area());
     //PHY, only for the configurations that include one
     const bool has_phy = mcp.type==0 || (mcp.type==1&&mcp.withPHY);
     if (has_phy)
     {
         PHY = new MCPHY(&interface_ip, mcp, mc_type);
         area.set_area(area.get_area()+ PHY->area.get_area());
     }
 }
 /*
  * Runs the energy computation of every sub-component and folds the result
  * into this controller's totals.
  *
  * is_tdp : when true the sub-components' `power` is added to `power`;
  *          otherwise their `rt_power` is added to `rt_power`.
  */
 void MemoryController::computeEnergy(bool is_tdp)
 {
        // The PHY only exists for type 0, or for type 1 when withPHY is set.
        const bool has_phy = (mcp.type == 0) || (mcp.type == 1 && mcp.withPHY);
        frontend->computeEnergy(is_tdp);
        transecEngine->computeEnergy(is_tdp);
        if (has_phy)
        {
                PHY->computeEnergy(is_tdp);
        }
        if (!is_tdp)
        {
                // Runtime totals.
                rt_power = rt_power + frontend->rt_power + transecEngine->rt_power;
                if (has_phy)
                {
                        rt_power = rt_power + PHY->rt_power;
                }
                return;
        }
        // Peak (TDP) totals.
        power = power + frontend->power + transecEngine->power;
        if (has_phy)
        {
                power = power + PHY->power;
        }
 }
 /*
  * Prints the area/power report of the memory controller to stdout.
  *
  * indent : number of spaces placed before the controller's own lines;
  *          sub-component lines are indented by two more.
  * plevel : detail level; above 2 the front end also prints its own report.
  * is_tdp : true prints the controller plus the Front End Engine, Transaction
  *          Engine and (when present) PHY sections; false prints only a short
  *          controller summary.
  *
  * Dynamic figures are multiplied by mcp.clockRate and runtime figures are
  * divided by mcp.executionTime before being printed with a " W" suffix.
  * When XML->sys.longer_channel_device is set, the longer-channel leakage
  * value is shown as subthreshold leakage in the is_tdp report.
  */
 void MemoryController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
 {
        string indent_str(indent, ' ');
        string indent_str_next(indent+2, ' ');
        bool long_channel = XML->sys.longer_channel_device;
        // The PHY only exists for type 0, or for type 1 when withPHY is set.
        const bool has_phy = (mcp.type==0 || (mcp.type==1&&mcp.withPHY));
        if (is_tdp)
        {
                cout << "Memory Controller:" << endl;
                cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
                cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*mcp.clockRate  << " W" << endl;
                cout << indent_str<< "Subthreshold Leakage = "
                        << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
                cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
                cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
                cout<<endl;
                cout << indent_str << "Front End Engine:" << endl;
                cout << indent_str_next << "Area = " << frontend->area.get_area()*1e-6<< " mm^2" << endl;
                cout << indent_str_next << "Peak Dynamic = " << frontend->power.readOp.dynamic*mcp.clockRate << " W" << endl;
                cout << indent_str_next << "Subthreshold Leakage = "
                        << (long_channel? frontend->power.readOp.longer_channel_leakage:frontend->power.readOp.leakage) <<" W" << endl;
                cout << indent_str_next << "Gate Leakage = " << frontend->power.readOp.gate_leakage << " W" << endl;
                cout << indent_str_next << "Runtime Dynamic = " << frontend->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
                cout <<endl;
                if (plevel >2){
                        // Forward the detail level explicitly. The previous call
                        // passed is_tdp as the second argument, which landed in the
                        // `plevel` parameter and discarded the caller's detail level.
                        frontend->displayEnergy(indent+4,plevel,is_tdp);
                }
                cout << indent_str << "Transaction Engine:" << endl;
                cout << indent_str_next << "Area = " << transecEngine->area.get_area()*1e-6<< " mm^2" << endl;
                cout << indent_str_next << "Peak Dynamic = " << transecEngine->power.readOp.dynamic*mcp.clockRate << " W" << endl;
                cout << indent_str_next << "Subthreshold Leakage = "
                        << (long_channel? transecEngine->power.readOp.longer_channel_leakage:transecEngine->power.readOp.leakage) <<" W" << endl;
                cout << indent_str_next << "Gate Leakage = " << transecEngine->power.readOp.gate_leakage << " W" << endl;
                cout << indent_str_next << "Runtime Dynamic = " << transecEngine->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
                cout <<endl;
                if (has_phy)
                {
                        cout << indent_str << "PHY:" << endl;
                        cout << indent_str_next << "Area = " << PHY->area.get_area()*1e-6<< " mm^2" << endl;
                        cout << indent_str_next << "Peak Dynamic = " << PHY->power.readOp.dynamic*mcp.clockRate << " W" << endl;
                        cout << indent_str_next << "Subthreshold Leakage = "
                        << (long_channel? PHY->power.readOp.longer_channel_leakage:PHY->power.readOp.leakage) <<" W" << endl;
                        cout << indent_str_next << "Gate Leakage = " << PHY->power.readOp.gate_leakage << " W" << endl;
                        cout << indent_str_next << "Runtime Dynamic = " << PHY->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
                        cout <<endl;
                }
        }
        else
        {
                // Short summary: no per-component sections, and the plain
                // (not longer-channel) leakage value is always shown.
                cout << "Memory Controller:" << endl;
                cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
                cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*mcp.clockRate << " W" << endl;
                cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl;
                cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
                cout<<endl;
        }
 }
 /*
  * Fills `mcp` from the XML configuration (XML->sys and XML->sys.mc).
  *
  * Only mc_type == MC is supported. For any other value a message is printed
  * and the process terminates with a non-zero exit status (it previously
  * exited with status 0, which reported success to the calling shell/script
  * even though the configuration was rejected).
  *
  * NOTE(review): flash-controller (FLASHC) support is not implemented; an
  * earlier disabled draft mirrored the assignments below using
  * XML->sys.flashc instead of XML->sys.mc.
  */
 void MemoryController::set_mc_param()
 {
        if (mc_type!=MC)
        {
                cout<<"Unknown memory controller type: neither DRAM controller nor Flash controller" <<endl;
                exit(1);
        }
        mcp.clockRate       =XML->sys.mc.mc_clock*2;//DDR double pumped
        mcp.clockRate       *= 1e6;
        mcp.executionTime   = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
        // Line length and data bus width are each widened by ceil(width/8)
        // to account for ECC overhead.
        mcp.llcBlockSize    =int(ceil(XML->sys.mc.llc_line_length/8.0))+XML->sys.mc.llc_line_length;
        mcp.dataBusWidth    =int(ceil(XML->sys.mc.databus_width/8.0)) + XML->sys.mc.databus_width;
        mcp.addressBusWidth =int(ceil(XML->sys.mc.addressbus_width));//XML->sys.physical_address_width;
        mcp.opcodeW         =16;
        mcp.num_mcs         = XML->sys.mc.number_mcs;
        mcp.num_channels    = XML->sys.mc.memory_channels_per_mc;
        mcp.reads  = XML->sys.mc.memory_reads;
        mcp.writes = XML->sys.mc.memory_writes;
        // Transaction engine inputs. TODO: needs better numbers (run the RTL
        // code from OpenSparc).
        mcp.peakDataTransferRate = XML->sys.mc.peak_transfer_rate;
        mcp.memRank = XML->sys.mc.number_ranks;
        // PHY inputs. TODO: needs better numbers.
        mcp.frontend_duty_cycle = 0.5;//for max power, the actual off-chip links is bidirectional but time shared
        mcp.LVDS = XML->sys.mc.LVDS;
        mcp.type = XML->sys.mc.type;
        mcp.withPHY = XML->sys.mc.withPHY;
 }
 /*
  * Releases the arbiter and the three buffers held by the front end and
  * clears the pointers afterwards.
  */
 MCFrontEnd ::~MCFrontEnd(){
        // delete on a null pointer does nothing, so no guards are needed.
        delete MC_arb;
        MC_arb = 0;
        delete frontendBuffer;
        frontendBuffer = 0;
        delete readBuffer;
        readBuffer = 0;
        delete writeBuffer;
        writeBuffer = 0;
 }
 /*
  * Releases the sub-components held by the controller and clears the
  * pointers afterwards.
  */
 MemoryController ::~MemoryController(){
        // delete on a null pointer does nothing, so no guards are needed.
        delete frontend;
        frontend = 0;
        delete transecEngine;
        transecEngine = 0;
        delete PHY;
        PHY = 0;
        delete pipeLogic;
        pipeLogic = 0;
 }
--- a/ext/mcpat/memoryctrl.h
+++ b/ext/mcpat/memoryctrl.h
@ -0,0 +1,113 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef MEMORYCTRL_H_
 #define MEMORYCTRL_H_
 #include "XML_Parse.h"
 #include "parameter.h"
 //#include "io.h"
 #include "array.h"
 //#include "Undifferentiated_Core_Area.h"
 #include <vector>
 #include "basic_components.h"
 /*
  * Back end of the memory controller model. MemoryController creates one
  * instance as its `transecEngine` and reports it under the heading
  * "Transaction Engine:".
  */
 class MCBackend : public Component {
  public:
    // NOTE(review): presumably a local copy of the input parameters handed
    // to the constructor - confirm against the constructor definition.
    InputParameter l_ip;
    uca_org_t local_result;
        // Controller kind supplied at construction.
        enum MemoryCtrl_type mc_type;
    // Memory controller parameters (constructor receives them as mcp_).
    MCParam  mcp;
    // NOTE(review): by name, statistics for the TDP and runtime cases plus
    // scratch values - confirm in the definitions of compute/computeEnergy.
    statsDef tdp_stats;
    statsDef rtp_stats;
    statsDef stats_t;
    powerDef power_t;
    MCBackend(InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_);
    void compute();
        // is_tdp selects between the peak (TDP) and the runtime computation.
        void computeEnergy(bool is_tdp=true);
    // Parameters are, in order: indent, detail level (plevel), is_tdp.
    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
    ~MCBackend(){};
 };
 /*
  * PHY part of the memory controller model. MemoryController only creates
  * it when mcp.type is 0, or when mcp.type is 1 and mcp.withPHY is set, and
  * reports it under the heading "PHY:".
  */
 class MCPHY : public Component {
  public:
    // NOTE(review): presumably a local copy of the input parameters handed
    // to the constructor - confirm against the constructor definition.
    InputParameter l_ip;
    uca_org_t local_result;
        // Controller kind supplied at construction.
        enum MemoryCtrl_type mc_type;
    // Memory controller parameters (constructor receives them as mcp_).
    MCParam  mcp;
    // NOTE(review): by name, statistics for the TDP and runtime cases plus
    // scratch values - confirm in the definitions of compute/computeEnergy.
    statsDef       tdp_stats;
    statsDef       rtp_stats;
    statsDef       stats_t;
    powerDef       power_t;
    MCPHY(InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_);
    void compute();
        // is_tdp selects between the peak (TDP) and the runtime computation.
        void computeEnergy(bool is_tdp=true);
    // Parameters are, in order: indent, detail level (plevel), is_tdp.
    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
    ~MCPHY(){};
 };
 /*
  * Front end of the memory controller model. MemoryController creates one
  * instance as its `frontend` and reports it under the heading
  * "Front End Engine:".
  */
 class MCFrontEnd : public Component {
  public:
        // Parsed configuration passed to the constructor.
        ParseXML *XML;
        InputParameter interface_ip;
        // Controller kind supplied at construction.
        enum MemoryCtrl_type mc_type;
        // Memory controller parameters (constructor receives them as mcp_).
        MCParam  mcp;
        // The four pointers below are deleted by ~MCFrontEnd().
        selection_logic * MC_arb;
        ArrayST  * frontendBuffer;
        ArrayST  * readBuffer;
        ArrayST  * writeBuffer;
    MCFrontEnd(ParseXML *XML_interface,InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_);
    // is_tdp selects between the peak (TDP) and the runtime computation.
    void computeEnergy(bool is_tdp=true);
    // Parameters are, in order: indent, detail level (plevel), is_tdp.
    // A two-argument call therefore sets plevel, not is_tdp.
    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
    ~MCFrontEnd();
 };
 /*
  * Top-level memory controller model. It aggregates a front end, a
  * transaction engine (MCBackend) and an optional PHY, and sums their area
  * and power into its own.
  */
 class MemoryController : public Component {
  public:
        // Parsed configuration passed to the constructor.
        ParseXML *XML;
        // Copy of the caller's input parameters; the constructor overrides
        // its wire settings.
        InputParameter interface_ip;
        enum MemoryCtrl_type mc_type;
    // Filled by set_mc_param() and handed to every sub-component.
    MCParam  mcp;
        // The four pointers below are deleted by ~MemoryController().
        MCFrontEnd * frontend;
    MCBackend * transecEngine;
    // Stays null unless mcp.type==0, or mcp.type==1 with mcp.withPHY set.
    MCPHY	 * PHY;
    // Initialised to null by the constructor, which never allocates it.
    Pipeline * pipeLogic;
    //clock_network clockNetwork;
    MemoryController(ParseXML *XML_interface,InputParameter* interface_ip_, enum MemoryCtrl_type mc_type_);
    // Reads the XML configuration into mcp; terminates the process when
    // mc_type is not MC.
    void set_mc_param();
    // is_tdp selects whether `power` (peak) or `rt_power` (runtime) is
    // accumulated.
    void computeEnergy(bool is_tdp=true);
    // Parameters are, in order: indent, detail level (plevel), is_tdp.
    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
    ~MemoryController();
 };
 #endif /* MEMORYCTRL_H_ */
--- a/ext/mcpat/noc.cc
+++ b/ext/mcpat/noc.cc
@ -0,0 +1,355 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #include <algorithm>
 #include <cassert>
 #include <cmath>
 #include <iostream>
 #include <string>
 #include "XML_Parse.h"
 #include "basic_circuit.h"
 #include "const.h"
 #include "io.h"
 #include "noc.h"
 #include "parameter.h"
 /*
  * Builds the model of one on-chip interconnect, which is either a routed
  * NoC or a bus depending on the configured type.
  *
  * XML_interface      : parsed configuration; kept as a pointer.
  * ithNoC_            : index of this interconnect in XML->sys.NoC.
  * interface_ip_      : input parameters; copied, the copy has its wire
  *                      settings overridden below.
  * M_traffic_pattern_ : traffic pattern factor, stored and passed on to the
  *                      router.
  * link_len_          : link length, used only when a bus is built here.
  */
 NoC::NoC(ParseXML *XML_interface, int ithNoC_, InputParameter* interface_ip_, double M_traffic_pattern_, double link_len_)
 :XML(XML_interface),
 ithNoC(ithNoC_),
 interface_ip(*interface_ip_),
 router(0),
 link_bus(0),
 link_bus_exist(false),
 router_exist(false),
 M_traffic_pattern(M_traffic_pattern_)
 {
        // Wire settings differ between embedded and non-embedded systems.
        if (!XML->sys.Embedded)
        {
                interface_ip.wt               = Global;
                interface_ip.wire_is_mat_type = 2;
                interface_ip.wire_os_mat_type = 2;
        }
        else
        {
                interface_ip.wt               = Global_30;
                interface_ip.wire_is_mat_type = 0;
                interface_ip.wire_os_mat_type = 1;
        }
        set_noc_param();
        local_result = init_interface(&interface_ip);
        scktRatio = g_tp.sckt_co_eff;
        if (!nocdynp.type)
        {
                // Bus: the bus itself is computed right away.
                init_link_bus(link_len_);
        }
        else
        {
                // NoC: only the router is computed here. The router links have
                // to be computed separately by an external call to
                // init_link_bus(), because the total chip area must be known
                // first.
                init_router();
        }
        // No clock network is modelled for the interconnect.
 }
 void NoC::init_router()
 {
        router  = new Router(nocdynp.flit_size,
                        nocdynp.virtual_channel_per_port*nocdynp.input_buffer_entries_per_vc,
                        nocdynp.virtual_channel_per_port, &(g_tp.peri_global),
                        nocdynp.input_ports,nocdynp.output_ports, M_traffic_pattern);
        //router->print_router();
        area.set_area(area.get_area()+ router->area.get_area()*nocdynp.total_nodes);
        double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
        router->power.readOp.longer_channel_leakage          = router->power.readOp.leakage * long_channel_device_reduction;
        router->buffer.power.readOp.longer_channel_leakage   = router->buffer.power.readOp.leakage * long_channel_device_reduction;
        router->crossbar.power.readOp.longer_channel_leakage = router->crossbar.power.readOp.leakage * long_channel_device_reduction;
        router->arbiter.power.readOp.longer_channel_leakage  = router->arbiter.power.readOp.leakage * long_channel_device_reduction;
        router_exist = true;
 }
 /*
  * Creates the link (NoC) or bus interconnect model and adds its area to
  * this component.
  *
  * link_len_ : overall length to distribute over the links; must be
  *             positive (checked with assert).
  */
 void NoC ::init_link_bus(double link_len_)
 {
        // Heading used when the result is displayed.
        link_name = nocdynp.type ? "Links" : "Bus";
        link_len = link_len_;
        assert(link_len>0);
        interface_ip.throughput = nocdynp.link_throughput/nocdynp.clockRate;
        interface_ip.latency    = nocdynp.link_latency/nocdynp.clockRate;
        // Length of one link: divide by the mean of the two node counts.
        // NOTE(review): if both node counts are integers the division by 2
        // truncates (and is zero when they sum to less than 2) - confirm the
        // field types and whether that is intended.
        link_len /= (nocdynp.horizontal_nodes + nocdynp.vertical_nodes)/2;
        if (nocdynp.total_nodes >1)
        {
                link_len /=2; //All links are shared by neighbors
        }
        link_bus = new interconnect(name, Uncore_device, 1, 1, nocdynp.flit_size,
                        link_len, &interface_ip, 3, true/*pipelinable*/, nocdynp.route_over_perc);
        // Per-router link area covers every globally linked port.
        link_bus_tot_per_Router.area.set_area(
                        link_bus_tot_per_Router.area.get_area()
                        + link_bus->area.get_area() * nocdynp.global_linked_ports);
        // The total covers every node.
        area.set_area(area.get_area()
                        + link_bus_tot_per_Router.area.get_area() * nocdynp.total_nodes);
        link_bus_exist = true;
 }
 /*
  * Accumulates the power of the router and/or the links into this component.
  *
  * is_tdp : true  - peak case; the activity is the configured duty cycle and
  *                  the result is added to `power`.
  *          false - runtime case; the activity is the configured total
  *                  access count and the result is added to `rt_power`.
  *
  * The runtime branch reads router->power, so the peak case has to be
  * computed first (see the original remark further down). Note that the
  * peak branch rescales router->power in place.
  */
 void NoC::computeEnergy(bool is_tdp)
 {
        //power_point_product_masks
        // NOTE(review): set_pppm presumably stores its four numeric arguments
        // into pppm_t, which then scales a power value via operator* - confirm
        // against its definition.
        double pppm_t[4]    = {1,1,1,1};
        double M=nocdynp.duty_cycle;
        if (is_tdp)
            {
                //init stats for TDP
                stats_t.readAc.access  = M;
            tdp_stats = stats_t;
            if (router_exist)
            {
                // First rescale the single-router power by the duty cycle
                // (this overwrites router->power), then add it once per node.
                set_pppm(pppm_t, 1*M, 1, 1, 1);//reset traffic pattern
                router->power = router->power*pppm_t;
                set_pppm(pppm_t, nocdynp.total_nodes, nocdynp.total_nodes, nocdynp.total_nodes, nocdynp.total_nodes);
                    power     = power + router->power*pppm_t;
            }
            if (link_bus_exist)
            {
                // Per-router link power: the first mask entry combines traffic
                // pattern, duty cycle and active port count; the other entries
                // use the number of globally linked ports.
                if (nocdynp.type)
                        set_pppm(pppm_t, 1*M_traffic_pattern*M*(nocdynp.min_ports -1), nocdynp.global_linked_ports,
                                nocdynp.global_linked_ports, nocdynp.global_linked_ports);
                    //reset traffic pattern; local port do not have router links
                else
                        set_pppm(pppm_t, 1*M_traffic_pattern*M*(nocdynp.min_ports), nocdynp.global_linked_ports,
                                                        nocdynp.global_linked_ports, nocdynp.global_linked_ports);//reset traffic pattern
                link_bus_tot_per_Router.power = link_bus->power*pppm_t;
                // Then add it once per node.
                set_pppm(pppm_t, nocdynp.total_nodes,
                                         nocdynp.total_nodes,
                                         nocdynp.total_nodes,
                                         nocdynp.total_nodes);
                power     = power + link_bus_tot_per_Router.power*pppm_t;
            }
            }
            else
            {
                //init stats for runtime power (RTP)
                stats_t.readAc.access  = XML->sys.NoC[ithNoC].total_accesses;
            rtp_stats = stats_t;
                // Mask (1,0,0,0) is applied to the summed sub-component
                // runtime power below.
                set_pppm(pppm_t, 1, 0 , 0, 0);
                if (router_exist)
                {
                // Runtime dynamic value of each router part = its dynamic
                // value times the access count (the buffer counts read plus
                // write).
                router->buffer.rt_power.readOp.dynamic = (router->buffer.power.readOp.dynamic + router->buffer.power.writeOp.dynamic)*rtp_stats.readAc.access ;
                router->crossbar.rt_power.readOp.dynamic = router->crossbar.power.readOp.dynamic*rtp_stats.readAc.access ;
                router->arbiter.rt_power.readOp.dynamic = router->arbiter.power.readOp.dynamic*rtp_stats.readAc.access ;
                        // router->power scaled by pppm_lkg is added on top of
                        // the masked sum; router->rt_power accumulates across
                        // calls.
                        router->rt_power = router->rt_power + (router->buffer.rt_power + router->crossbar.rt_power + router->arbiter.rt_power)*pppm_t +
                                        router->power*pppm_lkg;//TDP power must be calculated first!
                        rt_power     = rt_power + router->rt_power;
                }
                if (link_bus_exist)
                {
                        // Link runtime power: first and last mask entries are
                        // the access count, the middle two are 1.
                        set_pppm(pppm_t, rtp_stats.readAc.access, 1 , 1, rtp_stats.readAc.access);
                        link_bus->rt_power = link_bus->power * pppm_t;
                        rt_power = rt_power + link_bus->rt_power;
                }
            }
 }
 void NoC::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
 {
        string indent_str(indent, ' ');
        string indent_str_next(indent+2, ' ');
        bool long_channel = XML->sys.longer_channel_device;
        double M =M_traffic_pattern*nocdynp.duty_cycle;
        /*only router as a whole has been applied the M_traffic_pattern(0.6 by default) factor in router.cc;
         * 	When power of crossbars, arbiters, etc need to be displayed, the M_traffic_pattern factor need to
         * be applied together with McPAT's extra traffic pattern.
         * */
        if (is_tdp)
        {
                cout << name << endl;
                cout << indent_str << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
                cout << indent_str<< "Peak Dynamic = " << power.readOp.dynamic*nocdynp.clockRate << " W" << endl;
                cout << indent_str << "Subthreshold Leakage = "
                        << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
                cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
                cout << indent_str<< "Runtime Dynamic = " << rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
                cout<<endl;
                if (router_exist)
                {
                        cout << indent_str << "Router: " << endl;
                        cout << indent_str_next << "Area = " << router->area.get_area()*1e-6<< " mm^2" << endl;
                        cout << indent_str_next<< "Peak Dynamic = " << router->power.readOp.dynamic*nocdynp.clockRate << " W" << endl;
                        cout << indent_str_next << "Subthreshold Leakage = "
                        << (long_channel? router->power.readOp.longer_channel_leakage:router->power.readOp.leakage)  <<" W" << endl;
                        cout << indent_str_next << "Gate Leakage = " << router->power.readOp.gate_leakage << " W" << endl;
                        cout << indent_str_next<< "Runtime Dynamic = " << router->rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
                        cout<<endl;
                        if (plevel >2){
                                cout << indent_str<< indent_str << "Virtual Channel Buffer:" << endl;
                                cout << indent_str<< indent_str_next << "Area = " << router->buffer.area.get_area()*1e-6*nocdynp.input_ports<< " mm^2" << endl;
                                cout << indent_str<< indent_str_next << "Peak Dynamic = " <<(router->buffer.power.readOp.dynamic + router->buffer.power.writeOp.dynamic)
                                *nocdynp.min_ports*M*nocdynp.clockRate << " W" << endl;
                                cout << indent_str<< indent_str_next << "Subthreshold Leakage = "
                                << (long_channel? router->buffer.power.readOp.longer_channel_leakage*nocdynp.input_ports:router->buffer.power.readOp.leakage*nocdynp.input_ports)  <<" W" << endl;
                                cout << indent_str<< indent_str_next << "Gate Leakage = " << router->buffer.power.readOp.gate_leakage*nocdynp.input_ports << " W" << endl;
                                cout << indent_str<< indent_str_next << "Runtime Dynamic = " << router->buffer.rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
                                cout <<endl;
                                cout << indent_str<< indent_str<< "Crossbar:" << endl;
                                cout << indent_str<< indent_str_next << "Area = " << router->crossbar.area.get_area()*1e-6  << " mm^2" << endl;
                                cout << indent_str<< indent_str_next << "Peak Dynamic = " << router->crossbar.power.readOp.dynamic*nocdynp.clockRate*nocdynp.min_ports*M << " W" << endl;
                                cout << indent_str<< indent_str_next << "Subthreshold Leakage = "
                                << (long_channel? router->crossbar.power.readOp.longer_channel_leakage:router->crossbar.power.readOp.leakage)  << " W" << endl;
                                cout << indent_str<< indent_str_next << "Gate Leakage = " << router->crossbar.power.readOp.gate_leakage  << " W" << endl;
                                cout << indent_str<< indent_str_next << "Runtime Dynamic = " << router->crossbar.rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
                                cout <<endl;
                                cout << indent_str<< indent_str<< "Arbiter:" << endl;
                                cout << indent_str<< indent_str_next << "Peak Dynamic = " << router->arbiter.power.readOp.dynamic*nocdynp.clockRate*nocdynp.min_ports*M  << " W" << endl;
                                cout << indent_str<< indent_str_next << "Subthreshold Leakage = "
                                << (long_channel? router->arbiter.power.readOp.longer_channel_leakage:router->arbiter.power.readOp.leakage)  << " W" << endl;
                                cout << indent_str<< indent_str_next << "Gate Leakage = " << router->arbiter.power.readOp.gate_leakage  << " W" << endl;
                                cout << indent_str<< indent_str_next << "Runtime Dynamic = " << router->arbiter.rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
                                cout <<endl;
                        }
                }
                if (link_bus_exist)
                {
                        cout << indent_str << (nocdynp.type? "Per Router ":"") << link_name<<": " << endl;
                        cout << indent_str_next << "Area = " << link_bus_tot_per_Router.area.get_area()*1e-6<< " mm^2" << endl;
                        cout << indent_str_next<< "Peak Dynamic = " << link_bus_tot_per_Router.power.readOp.dynamic*
                                nocdynp.clockRate << " W" << endl;
                        cout << indent_str_next << "Subthreshold Leakage = "
                        << (long_channel? link_bus_tot_per_Router.power.readOp.longer_channel_leakage:link_bus_tot_per_Router.power.readOp.leakage)
                             <<" W" << endl;
                        cout << indent_str_next << "Gate Leakage = " << link_bus_tot_per_Router.power.readOp.gate_leakage
                                << " W" << endl;
                        cout << indent_str_next<< "Runtime Dynamic = " << link_bus->rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
                        cout<<endl;
                }
        }
        else
        {
 //		cout << indent_str_next << "Instruction Fetch Unit    Peak Dynamic = " << ifu->rt_power.readOp.dynamic*clockRate << " W" << endl;
 //		cout << indent_str_next << "Instruction Fetch Unit    Subthreshold Leakage = " << ifu->rt_power.readOp.leakage <<" W" << endl;
 //		cout << indent_str_next << "Instruction Fetch Unit    Gate Leakage = " << ifu->rt_power.readOp.gate_leakage << " W" << endl;
 //		cout << indent_str_next << "Load Store Unit   Peak Dynamic = " << lsu->rt_power.readOp.dynamic*clockRate  << " W" << endl;
 //		cout << indent_str_next << "Load Store Unit   Subthreshold Leakage = " << lsu->rt_power.readOp.leakage  << " W" << endl;
 //		cout << indent_str_next << "Load Store Unit   Gate Leakage = " << lsu->rt_power.readOp.gate_leakage  << " W" << endl;
 //		cout << indent_str_next << "Memory Management Unit   Peak Dynamic = " << mmu->rt_power.readOp.dynamic*clockRate  << " W" << endl;
 //		cout << indent_str_next << "Memory Management Unit   Subthreshold Leakage = " << mmu->rt_power.readOp.leakage  << " W" << endl;
 //		cout << indent_str_next << "Memory Management Unit   Gate Leakage = " << mmu->rt_power.readOp.gate_leakage  << " W" << endl;
 //		cout << indent_str_next << "Execution Unit   Peak Dynamic = " << exu->rt_power.readOp.dynamic*clockRate  << " W" << endl;
 //		cout << indent_str_next << "Execution Unit   Subthreshold Leakage = " << exu->rt_power.readOp.leakage  << " W" << endl;
 //		cout << indent_str_next << "Execution Unit   Gate Leakage = " << exu->rt_power.readOp.gate_leakage  << " W" << endl;
        }
 }
 /*
  * Fills `nocdynp` and `name` from the configuration entry
  * XML->sys.NoC[ithNoC] (plus the system-wide cycle count and core clock).
  * Asserts that chip_coverage and route_over_perc do not exceed 1.
  */
 void NoC::set_noc_param()
 {
        nocdynp.type            = XML->sys.NoC[ithNoC].type;
        // A non-zero type is reported as a NoC, zero as a bus.
        name = nocdynp.type ? "NOC" : "BUSES";
        nocdynp.clockRate       = XML->sys.NoC[ithNoC].clockrate;
        nocdynp.clockRate       *= 1e6;
        nocdynp.executionTime   = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
        nocdynp.flit_size       = XML->sys.NoC[ithNoC].flit_bits;
        if (!nocdynp.type)
        {
                // Bus: a single input and a single output port.
                nocdynp.input_ports   = 1;
                nocdynp.output_ports  = 1;
                nocdynp.min_ports     = min(nocdynp.input_ports,nocdynp.output_ports);
                nocdynp.global_linked_ports = 1;
        }
        else
        {
                /*
                 * Router: every port except the local input/output port
                 * needs a link (global_linked_ports). Only min_ports of them
                 * can be fully active at the same time, because the smaller
                 * of the input and output port counts is the bottleneck.
                 */
                nocdynp.input_ports   = XML->sys.NoC[ithNoC].input_ports;
                nocdynp.output_ports  = XML->sys.NoC[ithNoC].output_ports;
                nocdynp.min_ports     = min(nocdynp.input_ports,nocdynp.output_ports);
                nocdynp.global_linked_ports = (nocdynp.input_ports-1) + (nocdynp.output_ports-1);
        }
        // Router buffering.
        nocdynp.virtual_channel_per_port     = XML->sys.NoC[ithNoC].virtual_channel_per_port;
        nocdynp.input_buffer_entries_per_vc  = XML->sys.NoC[ithNoC].input_buffer_entries_per_vc;
        // Topology.
        nocdynp.horizontal_nodes  = XML->sys.NoC[ithNoC].horizontal_nodes;
        nocdynp.vertical_nodes    = XML->sys.NoC[ithNoC].vertical_nodes;
        nocdynp.total_nodes       = nocdynp.horizontal_nodes*nocdynp.vertical_nodes;
        // Activity and link settings.
        nocdynp.duty_cycle        = XML->sys.NoC[ithNoC].duty_cycle;
        nocdynp.has_global_link   = XML->sys.NoC[ithNoC].has_global_link;
        nocdynp.link_throughput   = XML->sys.NoC[ithNoC].link_throughput;
        nocdynp.link_latency      = XML->sys.NoC[ithNoC].link_latency;
        nocdynp.chip_coverage     = XML->sys.NoC[ithNoC].chip_coverage;
        nocdynp.route_over_perc   = XML->sys.NoC[ithNoC].route_over_perc;
        assert (nocdynp.chip_coverage <=1);
        assert (nocdynp.route_over_perc <=1);
 }
 /*
  * Releases the router and link/bus models and clears the pointers.
  */
 NoC ::~NoC(){
        // delete on a null pointer does nothing, so no guards are needed.
        delete router;
        router = 0;
        delete link_bus;
        link_bus = 0;
 }
--- a/ext/mcpat/noc.h
+++ b/ext/mcpat/noc.h
@ -0,0 +1,75 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
 #ifndef NOC_H_
 #define NOC_H_
 #include "XML_Parse.h"
 #include "array.h"
 #include "basic_components.h"
 #include "interconnect.h"
 #include "logic.h"
 #include "parameter.h"
 #include "router.h"
 /*
  * NoC: Component that models one on-chip interconnect instance, i.e. entry
  * ithNoC of XML->sys.NoC[]. set_noc_param() copies that entry into nocdynp
  * and names the component "NOC" when nocdynp.type is non-zero and "BUSES"
  * otherwise. The destructor deletes router and link_bus.
  */
 class NoC :public Component {
  public:
        // Parsed XML configuration; set_noc_param() reads XML->sys.NoC[ithNoC].
        ParseXML *XML;
        // Index of this interconnect inside XML->sys.NoC[].
        int  ithNoC;
        // Local copy of the technology/input parameters
        // (presumably copied from the constructor's interface_ip_ -- confirm in noc.cc).
        InputParameter interface_ip;
        // Link length (presumably the value handed to the constructor or
        // init_link_bus(); units not visible here -- confirm in noc.cc).
        double link_len;
        double executionTime;
        double scktRatio, chip_PR_overhead, macro_PR_overhead;
        // Router model; deleted by ~NoC().
        Router * router;
        // Link/bus wire model; deleted by ~NoC().
        interconnect * link_bus;
        // Per-NoC parameters filled in by set_noc_param() (ports, virtual
        // channels, node counts, duty cycle, link throughput/latency, ...).
        NoCParam  nocdynp;
        uca_org_t local_result;
        // Statistics and power scratch objects
        // (presumably tdp_ = peak/TDP, rtp_ = runtime -- confirm in noc.cc).
        statsDef       tdp_stats;
        statsDef       rtp_stats;
        statsDef       stats_t;
        powerDef       power_t;
        // Link/bus totals attributed to one router; Processor multiplies its
        // area by nocdynp.total_nodes to obtain the chip-level link area.
        Component      link_bus_tot_per_Router;
        bool link_bus_exist;
        bool router_exist;
        // name is "NOC" or "BUSES" (see set_noc_param()).
        string name, link_name;
        double M_traffic_pattern;
        // M_traffic_pattern_ defaults to 0.6 and link_len_ to 0. Processor passes
        // 1 for the traffic pattern, and for bus-type interconnects passes
        // sqrt(chip area * chip_coverage) as link_len_.
        NoC(ParseXML *XML_interface, int ithNoC_, InputParameter* interface_ip_, double M_traffic_pattern_ = 0.6,double link_len_=0);
        // Copies XML->sys.NoC[ithNoC] into nocdynp and sets name.
        void set_noc_param();
        // is_tdp selects between the two passes Processor always runs back to
        // back: computeEnergy() followed by computeEnergy(false).
        void computeEnergy(bool is_tdp=true);
        void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
        // Processor calls this after the router areas are summed, for NoC-type
        // interconnects with nocdynp.has_global_link set, passing
        // sqrt(chip area * chip_coverage).
        void init_link_bus(double link_len_);
        void init_router();
        void computeEnergy_link_bus(bool is_tdp=true);
        void displayEnergy_link_bus(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
        ~NoC();
 };
 #endif /* NOC_H_ */
--- a/ext/mcpat/processor.cc
+++ b/ext/mcpat/processor.cc
@ -0,0 +1,839 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
 #include <algorithm>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <fstream>
 #include <iostream>
 #include "XML_Parse.h"
 #include "array.h"
 #include "basic_circuit.h"
 #include "const.h"
 #include "parameter.h"
 #include "processor.h"
 #include "version.h"
 /*
  * Build the chip-level model from the parsed XML description.
  *
  * For every component class (cores, shared L2s, L3s, L1/L2 directories,
  * memory/flash/NIU/PCIe controllers and NoCs/buses) the constructor
  *   1. instantiates the component model(s),
  *   2. runs computeEnergy() and computeEnergy(false) on each of them,
  *   3. accumulates area, power and rt_power into the per-class summary
  *      member (core, l2, l3, l1dir, l2dir, mcs, ..., noc) and into the
  *      chip totals (area, power, rt_power).
  *
  * When a class is flagged homogeneous only one representative instance is
  * built (the count collapses to 0 or 1) and its figures are scaled by the
  * configured instance count; otherwise one instance is built per unit.
  *
  * pppm_t is the 4-entry multiplier array filled by set_pppm() and applied to
  * a power object with operator*. In every section the first entry carries a
  * clock rate (peak power) or 1/executionTime (runtime power), presumably to
  * convert per-cycle/per-run energy into power -- confirm in basic_components.
  *
  * Exits the process with a non-zero status when private L2s are requested
  * but the number of L2s does not match the number of cores.
  */
 Processor::Processor(ParseXML *XML_interface)
 :XML(XML_interface),//TODO: using one global copy may have problems.
 mc(0),
 niu(0),
 pcie(0),
 flashcontroller(0)
 {
  /*
   *  placement and routing overhead is 10%, core scales worse than cache 40% is accumulated from 90 to 22nm
   *  (no such overhead factor is applied in this constructor; areas are summed as reported by the components).
   *  There is no point to have heterogeneous memory controller on chip,
   *  thus McPAT only support homogeneous memory controllers.
   */
  int i;
  double pppm_t[4]    = {1,1,1,1};
  set_proc_param();

  // Number of model instances to build per class: a homogeneous class needs
  // at most one representative instance.
  if (procdynp.homoCore)
          numCore = procdynp.numCore==0? 0:1;
  else
          numCore = procdynp.numCore;
  if (procdynp.homoL2)
          numL2 = procdynp.numL2==0? 0:1;
  else
          numL2 = procdynp.numL2;
  if (XML->sys.Private_L2 && numCore != numL2)
  {
          cout<<"Number of private L2 does not match number of cores"<<endl;
          // Fatal configuration error: report failure to the caller.
          exit(1);
  }
  if (procdynp.homoL3)
          numL3 = procdynp.numL3==0? 0:1;
  else
          numL3 = procdynp.numL3;
  if (procdynp.homoNOC)
          numNOC = procdynp.numNOC==0? 0:1;
  else
          numNOC = procdynp.numNOC;
 //  if (!procdynp.homoNOC)
 //  {
 //	  cout<<"Current McPAT does not support heterogeneous NOC"<<endl;
 //      exit(0);
 //  }
  if (procdynp.homoL1Dir)
          numL1Dir = procdynp.numL1Dir==0? 0:1;
  else
          numL1Dir = procdynp.numL1Dir;
  if (procdynp.homoL2Dir)
          numL2Dir = procdynp.numL2Dir==0? 0:1;
  else
          numL2Dir = procdynp.numL2Dir;

  // ---- Cores ----
  for (i = 0;i < numCore; i++)
  {
                  cores.push_back(new Core(XML,i, &interface_ip));
                  cores[i]->computeEnergy();
                  cores[i]->computeEnergy(false);
                  if (procdynp.homoCore){
                          // One representative core scaled by the configured core count.
                          core.area.set_area(core.area.get_area() + cores[i]->area.get_area()*procdynp.numCore);
                          set_pppm(pppm_t,cores[i]->clockRate*procdynp.numCore, procdynp.numCore,procdynp.numCore,procdynp.numCore);
                          core.power = core.power + cores[i]->power*pppm_t;
                          set_pppm(pppm_t,1/cores[i]->executionTime, procdynp.numCore,procdynp.numCore,procdynp.numCore);
                          core.rt_power = core.rt_power + cores[i]->rt_power*pppm_t;
                          area.set_area(area.get_area() + core.area.get_area());
                          power = power  + core.power;
                          rt_power = rt_power  + core.rt_power;
                  }
                  else{
                          core.area.set_area(core.area.get_area() + cores[i]->area.get_area());
                          area.set_area(area.get_area() + cores[i]->area.get_area());
                          set_pppm(pppm_t,cores[i]->clockRate, 1, 1, 1);
                          core.power = core.power + cores[i]->power*pppm_t;
                          power = power  + cores[i]->power*pppm_t;
                          set_pppm(pppm_t,1/cores[i]->executionTime, 1, 1, 1);
                          core.rt_power = core.rt_power + cores[i]->rt_power*pppm_t;
                          rt_power = rt_power  + cores[i]->rt_power*pppm_t;
                  }
  }

  // ---- Shared L2 caches (skipped entirely when Private_L2 is set) ----
  if (!XML->sys.Private_L2)
  {
  if (numL2 >0)
          for (i = 0;i < numL2; i++)
          {
                  l2array.push_back(new SharedCache(XML,i, &interface_ip));
                  l2array[i]->computeEnergy();
                  l2array[i]->computeEnergy(false);
                  if (procdynp.homoL2){
                          l2.area.set_area(l2.area.get_area() + l2array[i]->area.get_area()*procdynp.numL2);
                          set_pppm(pppm_t,l2array[i]->cachep.clockRate*procdynp.numL2, procdynp.numL2,procdynp.numL2,procdynp.numL2);
                          l2.power = l2.power + l2array[i]->power*pppm_t;
                          set_pppm(pppm_t,1/l2array[i]->cachep.executionTime, procdynp.numL2,procdynp.numL2,procdynp.numL2);
                          l2.rt_power = l2.rt_power + l2array[i]->rt_power*pppm_t;
                          area.set_area(area.get_area() + l2.area.get_area());
                          power = power  + l2.power;
                          rt_power = rt_power  + l2.rt_power;
                  }
                  else{
                          l2.area.set_area(l2.area.get_area() + l2array[i]->area.get_area());
                          area.set_area(area.get_area() + l2array[i]->area.get_area());
                          set_pppm(pppm_t,l2array[i]->cachep.clockRate, 1, 1, 1);
                          l2.power = l2.power + l2array[i]->power*pppm_t;
                          power = power  + l2array[i]->power*pppm_t;
                          set_pppm(pppm_t,1/l2array[i]->cachep.executionTime, 1, 1, 1);
                          l2.rt_power = l2.rt_power + l2array[i]->rt_power*pppm_t;
                          rt_power = rt_power  + l2array[i]->rt_power*pppm_t;
                  }
          }
  }

  // ---- L3 caches ----
  if (numL3 >0)
          for (i = 0;i < numL3; i++)
          {
                  l3array.push_back(new SharedCache(XML,i, &interface_ip, L3));
                  l3array[i]->computeEnergy();
                  l3array[i]->computeEnergy(false);
                  if (procdynp.homoL3){
                          l3.area.set_area(l3.area.get_area() + l3array[i]->area.get_area()*procdynp.numL3);
                          set_pppm(pppm_t,l3array[i]->cachep.clockRate*procdynp.numL3, procdynp.numL3,procdynp.numL3,procdynp.numL3);
                          l3.power = l3.power + l3array[i]->power*pppm_t;
                          set_pppm(pppm_t,1/l3array[i]->cachep.executionTime, procdynp.numL3,procdynp.numL3,procdynp.numL3);
                          l3.rt_power = l3.rt_power + l3array[i]->rt_power*pppm_t;
                          area.set_area(area.get_area() + l3.area.get_area());
                          power = power  + l3.power;
                          rt_power = rt_power  + l3.rt_power;
                  }
                  else{
                          l3.area.set_area(l3.area.get_area() + l3array[i]->area.get_area());
                          area.set_area(area.get_area() + l3array[i]->area.get_area());
                          set_pppm(pppm_t,l3array[i]->cachep.clockRate, 1, 1, 1);
                          l3.power = l3.power + l3array[i]->power*pppm_t;
                          power = power  + l3array[i]->power*pppm_t;
                          set_pppm(pppm_t,1/l3array[i]->cachep.executionTime, 1, 1, 1);
                          l3.rt_power = l3.rt_power + l3array[i]->rt_power*pppm_t;
                          rt_power = rt_power  + l3array[i]->rt_power*pppm_t;
                  }
          }

  // ---- First-level directories ----
  if (numL1Dir >0)
          for (i = 0;i < numL1Dir; i++)
          {
                  l1dirarray.push_back(new SharedCache(XML,i, &interface_ip, L1Directory));
                  l1dirarray[i]->computeEnergy();
                  l1dirarray[i]->computeEnergy(false);
                  if (procdynp.homoL1Dir){
                          l1dir.area.set_area(l1dir.area.get_area() + l1dirarray[i]->area.get_area()*procdynp.numL1Dir);
                          set_pppm(pppm_t,l1dirarray[i]->cachep.clockRate*procdynp.numL1Dir, procdynp.numL1Dir,procdynp.numL1Dir,procdynp.numL1Dir);
                          l1dir.power = l1dir.power + l1dirarray[i]->power*pppm_t;
                          set_pppm(pppm_t,1/l1dirarray[i]->cachep.executionTime, procdynp.numL1Dir,procdynp.numL1Dir,procdynp.numL1Dir);
                          l1dir.rt_power = l1dir.rt_power + l1dirarray[i]->rt_power*pppm_t;
                          area.set_area(area.get_area() + l1dir.area.get_area());
                          power = power  + l1dir.power;
                          rt_power = rt_power  + l1dir.rt_power;
                  }
                  else{
                          l1dir.area.set_area(l1dir.area.get_area() + l1dirarray[i]->area.get_area());
                          area.set_area(area.get_area() + l1dirarray[i]->area.get_area());
                          set_pppm(pppm_t,l1dirarray[i]->cachep.clockRate, 1, 1, 1);
                          l1dir.power = l1dir.power + l1dirarray[i]->power*pppm_t;
                          // The chip total must use the same scaled value as the
                          // l1dir subtotal (as every other component class does).
                          power = power  + l1dirarray[i]->power*pppm_t;
                          set_pppm(pppm_t,1/l1dirarray[i]->cachep.executionTime, 1, 1, 1);
                          l1dir.rt_power = l1dir.rt_power + l1dirarray[i]->rt_power*pppm_t;
                          rt_power = rt_power  + l1dirarray[i]->rt_power*pppm_t;
                  }
          }

  // ---- Second-level directories ----
  if (numL2Dir >0)
          for (i = 0;i < numL2Dir; i++)
          {
                  l2dirarray.push_back(new SharedCache(XML,i, &interface_ip, L2Directory));
                  l2dirarray[i]->computeEnergy();
                  l2dirarray[i]->computeEnergy(false);
                  if (procdynp.homoL2Dir){
                          l2dir.area.set_area(l2dir.area.get_area() + l2dirarray[i]->area.get_area()*procdynp.numL2Dir);
                          set_pppm(pppm_t,l2dirarray[i]->cachep.clockRate*procdynp.numL2Dir, procdynp.numL2Dir,procdynp.numL2Dir,procdynp.numL2Dir);
                          l2dir.power = l2dir.power + l2dirarray[i]->power*pppm_t;
                          set_pppm(pppm_t,1/l2dirarray[i]->cachep.executionTime, procdynp.numL2Dir,procdynp.numL2Dir,procdynp.numL2Dir);
                          l2dir.rt_power = l2dir.rt_power + l2dirarray[i]->rt_power*pppm_t;
                          area.set_area(area.get_area() + l2dir.area.get_area());
                          power = power  + l2dir.power;
                          rt_power = rt_power  + l2dir.rt_power;
                  }
                  else{
                          l2dir.area.set_area(l2dir.area.get_area() + l2dirarray[i]->area.get_area());
                          area.set_area(area.get_area() + l2dirarray[i]->area.get_area());
                          set_pppm(pppm_t,l2dirarray[i]->cachep.clockRate, 1, 1, 1);
                          l2dir.power = l2dir.power + l2dirarray[i]->power*pppm_t;
                          power = power  + l2dirarray[i]->power*pppm_t;
                          set_pppm(pppm_t,1/l2dirarray[i]->cachep.executionTime, 1, 1, 1);
                          l2dir.rt_power = l2dir.rt_power + l2dirarray[i]->rt_power*pppm_t;
                          rt_power = rt_power  + l2dirarray[i]->rt_power*pppm_t;
                  }
          }

  // ---- Memory controllers (always modelled as homogeneous) ----
  if (XML->sys.mc.number_mcs >0 && XML->sys.mc.memory_channels_per_mc>0)
  {
          mc = new MemoryController(XML, &interface_ip, MC);
          mc->computeEnergy();
          mc->computeEnergy(false);
          mcs.area.set_area(mcs.area.get_area()+mc->area.get_area()*XML->sys.mc.number_mcs);
          area.set_area(area.get_area()+mc->area.get_area()*XML->sys.mc.number_mcs);
          set_pppm(pppm_t,XML->sys.mc.number_mcs*mc->mcp.clockRate, XML->sys.mc.number_mcs,XML->sys.mc.number_mcs,XML->sys.mc.number_mcs);
          mcs.power = mc->power*pppm_t;
          power = power  + mcs.power;
          set_pppm(pppm_t,1/mc->mcp.executionTime, XML->sys.mc.number_mcs,XML->sys.mc.number_mcs,XML->sys.mc.number_mcs);
          mcs.rt_power = mc->rt_power*pppm_t;
          rt_power = rt_power  + mcs.rt_power;
  }

  // ---- Flash controllers ----
  if (XML->sys.flashc.number_mcs >0 )//flash controller
  {
          flashcontroller = new FlashController(XML, &interface_ip);
          flashcontroller->computeEnergy();
          flashcontroller->computeEnergy(false);
          double number_fcs = flashcontroller->fcp.num_mcs;
          flashcontrollers.area.set_area(flashcontrollers.area.get_area()+flashcontroller->area.get_area()*number_fcs);
          area.set_area(area.get_area()+flashcontrollers.area.get_area());
          // Only the unit count is used as multiplier here (no clock rate or
          // execution time), for both peak and runtime power.
          set_pppm(pppm_t,number_fcs, number_fcs ,number_fcs, number_fcs );
          flashcontrollers.power = flashcontroller->power*pppm_t;
          power = power  + flashcontrollers.power;
          set_pppm(pppm_t,number_fcs , number_fcs ,number_fcs ,number_fcs );
          flashcontrollers.rt_power = flashcontroller->rt_power*pppm_t;
          rt_power = rt_power  + flashcontrollers.rt_power;
  }

  // ---- Network interface units ----
  if (XML->sys.niu.number_units >0)
  {
          niu = new NIUController(XML, &interface_ip);
          niu->computeEnergy();
          niu->computeEnergy(false);
          nius.area.set_area(nius.area.get_area()+niu->area.get_area()*XML->sys.niu.number_units);
          area.set_area(area.get_area()+niu->area.get_area()*XML->sys.niu.number_units);
          set_pppm(pppm_t,XML->sys.niu.number_units*niu->niup.clockRate, XML->sys.niu.number_units,XML->sys.niu.number_units,XML->sys.niu.number_units);
          nius.power = niu->power*pppm_t;
          power = power  + nius.power;
          // NOTE(review): runtime power reuses the clock-rate multiplier instead
          // of 1/executionTime as the cores/caches/MC do -- confirm this is intended.
          set_pppm(pppm_t,XML->sys.niu.number_units*niu->niup.clockRate, XML->sys.niu.number_units,XML->sys.niu.number_units,XML->sys.niu.number_units);
          nius.rt_power = niu->rt_power*pppm_t;
          rt_power = rt_power  + nius.rt_power;
  }

  // ---- PCIe controllers ----
  if (XML->sys.pcie.number_units >0 && XML->sys.pcie.num_channels >0)
  {
          pcie = new PCIeController(XML, &interface_ip);
          pcie->computeEnergy();
          pcie->computeEnergy(false);
          pcies.area.set_area(pcies.area.get_area()+pcie->area.get_area()*XML->sys.pcie.number_units);
          area.set_area(area.get_area()+pcie->area.get_area()*XML->sys.pcie.number_units);
          set_pppm(pppm_t,XML->sys.pcie.number_units*pcie->pciep.clockRate, XML->sys.pcie.number_units,XML->sys.pcie.number_units,XML->sys.pcie.number_units);
          pcies.power = pcie->power*pppm_t;
          power = power  + pcies.power;
          // NOTE(review): same clock-rate multiplier for runtime power as the NIU
          // section above -- confirm this is intended.
          set_pppm(pppm_t,XML->sys.pcie.number_units*pcie->pciep.clockRate, XML->sys.pcie.number_units,XML->sys.pcie.number_units,XML->sys.pcie.number_units);
          pcies.rt_power = pcie->rt_power*pppm_t;
          rt_power = rt_power  + pcies.rt_power;
  }

  // ---- NoCs / buses ----
  if (numNOC >0)
  {
          for (i = 0;i < numNOC; i++)
          {
                  if (XML->sys.NoC[i].type)
                  {//First add up area of routers if NoC is used
                          nocs.push_back(new NoC(XML,i, &interface_ip, 1));
                          if (procdynp.homoNOC)
                          {
                                  noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area()*procdynp.numNOC);
                                  area.set_area(area.get_area() + noc.area.get_area());
                          }
                          else
                          {
                                  noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area());
                                  area.set_area(area.get_area() + nocs[i]->area.get_area());
                          }
                  }
                  else
                  {//Bus based interconnect
                          // The bus length is derived from the chip area accumulated so far.
                          nocs.push_back(new NoC(XML,i, &interface_ip, 1, sqrt(area.get_area()*XML->sys.NoC[i].chip_coverage)));
                          if (procdynp.homoNOC){
                                  noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area()*procdynp.numNOC);
                                  area.set_area(area.get_area() + noc.area.get_area());
                          }
                          else
                          {
                                  noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area());
                                  area.set_area(area.get_area() + nocs[i]->area.get_area());
                          }
                  }
          }
          /*
           * Compute global links associated with each NOC, if any. This must be done at the end (even after the NOC router part) since the total chip
           * area must be obtained to decide the link routing
           */
          for (i = 0;i < numNOC; i++)
          {
                  if (nocs[i]->nocdynp.has_global_link && XML->sys.NoC[i].type)
                  {
                          nocs[i]->init_link_bus(sqrt(area.get_area()*XML->sys.NoC[i].chip_coverage));//compute global links
                          if (procdynp.homoNOC)
                          {
                                  noc.area.set_area(noc.area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area()
                                                  * nocs[i]->nocdynp.total_nodes
                                                  * procdynp.numNOC);
                                  area.set_area(area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area()
                                                  * nocs[i]->nocdynp.total_nodes
                                                  * procdynp.numNOC);
                          }
                          else
                          {
                                  noc.area.set_area(noc.area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area()
                                                  * nocs[i]->nocdynp.total_nodes);
                                  area.set_area(area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area()
                                                  * nocs[i]->nocdynp.total_nodes);
                          }
                  }
          }
          //Compute energy of NoC (w or w/o links) or buses
          for (i = 0;i < numNOC; i++)
          {
                  nocs[i]->computeEnergy();
                  nocs[i]->computeEnergy(false);
                  if (procdynp.homoNOC){
                          set_pppm(pppm_t,procdynp.numNOC*nocs[i]->nocdynp.clockRate, procdynp.numNOC,procdynp.numNOC,procdynp.numNOC);
                          noc.power = noc.power + nocs[i]->power*pppm_t;
                          set_pppm(pppm_t,1/nocs[i]->nocdynp.executionTime, procdynp.numNOC,procdynp.numNOC,procdynp.numNOC);
                          noc.rt_power = noc.rt_power + nocs[i]->rt_power*pppm_t;
                          power = power  + noc.power;
                          rt_power = rt_power  + noc.rt_power;
                  }
                  else
                  {
                          set_pppm(pppm_t,nocs[i]->nocdynp.clockRate, 1, 1, 1);
                          noc.power = noc.power + nocs[i]->power*pppm_t;
                          power = power  + nocs[i]->power*pppm_t;
                          set_pppm(pppm_t,1/nocs[i]->nocdynp.executionTime, 1, 1, 1);
                          noc.rt_power = noc.rt_power + nocs[i]->rt_power*pppm_t;
                          rt_power = rt_power  + nocs[i]->rt_power*pppm_t;
                  }
          }
  }
 //  //clock power
 //  globalClock.init_wire_external(is_default, &interface_ip);
 //  globalClock.clk_area           =area*1e6; //change it from mm^2 to um^2
 //  globalClock.end_wiring_level   =5;//toplevel metal
 //  globalClock.start_wiring_level =5;//toplevel metal
 //  globalClock.l_ip.with_clock_grid=false;//global clock does not drive local final nodes
 //  globalClock.optimize_wire();
 }
 /*
  * Print a one-line, human-readable description of a device type code to
  * stdout, prefixed by `indent` spaces.
  *
  * Recognised codes:
  *   0 ITRS high performance, 1 ITRS low standby power,
  *   2 ITRS low operating power, 3 LP-DRAM, 4 COMM-DRAM.
  *
  * Any other code is a fatal configuration error: a message is printed and
  * the process terminates with a non-zero exit status.
  */
 void Processor::displayDeviceType(int device_type_, uint32_t indent)
 {
        string indent_str(indent, ' ');
        switch ( device_type_ ) {
          case 0 :
                  cout <<indent_str<<"Device Type= "<<"ITRS high performance device type"<<endl;
            break;
          case 1 :
                  cout <<indent_str<<"Device Type= "<<"ITRS low standby power device type"<<endl;
            break;
          case 2 :
                  cout <<indent_str<<"Device Type= "<<"ITRS low operating power device type"<<endl;
            break;
          case 3 :
                  cout <<indent_str<<"Device Type= "<<"LP-DRAM device type"<<endl;
            break;
          case 4 :
                  cout <<indent_str<<"Device Type= "<<"COMM-DRAM device type"<<endl;
            break;
          default :
                  {
                          cout <<indent_str<<"Unknown Device Type"<<endl;
                          // Failure must not be reported as a successful exit.
                          exit(1);
                  }
        }
 }
 /*
  * Print a one-line, human-readable description of the interconnect (wire)
  * technology projection to stdout, prefixed by `indent` spaces.
  *
  * Recognised codes: 0 aggressive projection, 1 conservative projection.
  *
  * Any other code is a fatal configuration error: a message is printed and
  * the process terminates with a non-zero exit status.
  */
 void Processor::displayInterconnectType(int interconnect_type_, uint32_t indent)
 {
        string indent_str(indent, ' ');
        switch ( interconnect_type_ ) {
          case 0 :
                  cout <<indent_str<<"Interconnect metal projection= "<<"aggressive interconnect technology projection"<<endl;
            break;
          case 1 :
                  cout <<indent_str<<"Interconnect metal projection= "<<"conservative interconnect technology projection"<<endl;
            break;
          default :
                  {
                          cout <<indent_str<<"Unknown Interconnect Projection Type"<<endl;
                          // Failure must not be reported as a successful exit.
                          exit(1);
                  }
        }
 }
 void Processor::displayEnergy(uint32_t indent, int plevel, bool is_tdp)
 {
        int i;
        bool long_channel = XML->sys.longer_channel_device;
        string indent_str(indent, ' ');
        string indent_str_next(indent+2, ' ');
        if (is_tdp)
        {
                if (plevel<5)
                {
                        cout<<"\nMcPAT (version "<< VER_MAJOR <<"."<< VER_MINOR
                                        << " of " << VER_UPDATE << ") results (current print level is "<< plevel
                        <<", please increase print level to see the details in components): "<<endl;
                }
                else
                {
                        cout<<"\nMcPAT (version "<< VER_MAJOR <<"."<< VER_MINOR
                                                                << " of " << VER_UPDATE << ") results  (current print level is 5)"<< endl;
                }
                cout <<"*****************************************************************************************"<<endl;
                cout <<indent_str<<"Technology "<<XML->sys.core_tech_node<<" nm"<<endl;
                //cout <<indent_str<<"Device Type= "<<XML->sys.device_type<<endl;
                if (long_channel)
                        cout <<indent_str<<"Using Long Channel Devices When Appropriate"<<endl;
                //cout <<indent_str<<"Interconnect metal projection= "<<XML->sys.interconnect_projection_type<<endl;
                displayInterconnectType(XML->sys.interconnect_projection_type, indent);
                cout <<indent_str<<"Core clock Rate(MHz) "<<XML->sys.core[0].clock_rate<<endl;
        cout <<endl;
                cout <<"*****************************************************************************************"<<endl;
                cout <<"Processor: "<<endl;
                cout << indent_str << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
                cout << indent_str << "Peak Power = " << power.readOp.dynamic +
                        (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) + power.readOp.gate_leakage <<" W" << endl;
                cout << indent_str << "Total Leakage = " <<
                        (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) + power.readOp.gate_leakage <<" W" << endl;
                cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic << " W" << endl;
                cout << indent_str << "Subthreshold Leakage = " << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
                //cout << indent_str << "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
                cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
                cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic << " W" << endl;
                cout <<endl;
                if (numCore >0){
                cout <<indent_str<<"Total Cores: "<<XML->sys.number_of_cores << " cores "<<endl;
                displayDeviceType(XML->sys.device_type,indent);
                cout << indent_str_next << "Area = " << core.area.get_area()*1e-6<< " mm^2" << endl;
                cout << indent_str_next << "Peak Dynamic = " << core.power.readOp.dynamic << " W" << endl;
                cout << indent_str_next << "Subthreshold Leakage = "
                        << (long_channel? core.power.readOp.longer_channel_leakage:core.power.readOp.leakage) <<" W" << endl;
                //cout << indent_str_next << "Subthreshold Leakage = " << core.power.readOp.longer_channel_leakage <<" W" << endl;
                cout << indent_str_next << "Gate Leakage = " << core.power.readOp.gate_leakage << " W" << endl;
                cout << indent_str_next << "Runtime Dynamic = " << core.rt_power.readOp.dynamic << " W" << endl;
                cout <<endl;
                }
                if (!XML->sys.Private_L2)
                {
                        if (numL2 >0){
                                cout <<indent_str<<"Total L2s: "<<endl;
                                displayDeviceType(XML->sys.L2[0].device_type,indent);
                                cout << indent_str_next << "Area = " << l2.area.get_area()*1e-6<< " mm^2" << endl;
                                cout << indent_str_next << "Peak Dynamic = " << l2.power.readOp.dynamic << " W" << endl;
                                cout << indent_str_next << "Subthreshold Leakage = "
                                << (long_channel? l2.power.readOp.longer_channel_leakage:l2.power.readOp.leakage) <<" W" << endl;
                                //cout << indent_str_next << "Subthreshold Leakage = " << l2.power.readOp.longer_channel_leakage <<" W" << endl;
                                cout << indent_str_next << "Gate Leakage = " << l2.power.readOp.gate_leakage << " W" << endl;
                                cout << indent_str_next << "Runtime Dynamic = " << l2.rt_power.readOp.dynamic << " W" << endl;
                                cout <<endl;
                        }
                }
                if (numL3 >0){
                        cout <<indent_str<<"Total L3s: "<<endl;
                        displayDeviceType(XML->sys.L3[0].device_type, indent);
                        cout << indent_str_next << "Area = " << l3.area.get_area()*1e-6<< " mm^2" << endl;
                        cout << indent_str_next << "Peak Dynamic = " << l3.power.readOp.dynamic << " W" << endl;
                        cout << indent_str_next << "Subthreshold Leakage = "
                                << (long_channel? l3.power.readOp.longer_channel_leakage:l3.power.readOp.leakage) <<" W" << endl;
                        //cout << indent_str_next << "Subthreshold Leakage = " << l3.power.readOp.longer_channel_leakage <<" W" << endl;
                        cout << indent_str_next << "Gate Leakage = " << l3.power.readOp.gate_leakage << " W" << endl;
                        cout << indent_str_next << "Runtime Dynamic = " << l3.rt_power.readOp.dynamic << " W" << endl;
                        cout <<endl;
                }
                if (numL1Dir >0){
                        cout <<indent_str<<"Total First Level Directory: "<<endl;
                        displayDeviceType(XML->sys.L1Directory[0].device_type, indent);
                        cout << indent_str_next << "Area = " << l1dir.area.get_area()*1e-6<< " mm^2" << endl;
                        cout << indent_str_next << "Peak Dynamic = " << l1dir.power.readOp.dynamic << " W" << endl;
                        cout << indent_str_next << "Subthreshold Leakage = "
                                << (long_channel? l1dir.power.readOp.longer_channel_leakage:l1dir.power.readOp.leakage) <<" W" << endl;
                        //cout << indent_str_next << "Subthreshold Leakage = " << l1dir.power.readOp.longer_channel_leakage <<" W" << endl;
                        cout << indent_str_next << "Gate Leakage = " << l1dir.power.readOp.gate_leakage << " W" << endl;
                        cout << indent_str_next << "Runtime Dynamic = " << l1dir.rt_power.readOp.dynamic << " W" << endl;
                        cout <<endl;
                }
                if (numL2Dir >0){
                        cout <<indent_str<<"Total First Level Directory: "<<endl;
                        displayDeviceType(XML->sys.L1Directory[0].device_type, indent);
                        cout << indent_str_next << "Area = " << l2dir.area.get_area()*1e-6<< " mm^2" << endl;
                        cout << indent_str_next << "Peak Dynamic = " << l2dir.power.readOp.dynamic << " W" << endl;
                        cout << indent_str_next << "Subthreshold Leakage = "
                                << (long_channel? l2dir.power.readOp.longer_channel_leakage:l2dir.power.readOp.leakage) <<" W" << endl;
                        //cout << indent_str_next << "Subthreshold Leakage = " << l2dir.power.readOp.longer_channel_leakage <<" W" << endl;
                        cout << indent_str_next << "Gate Leakage = " << l2dir.power.readOp.gate_leakage << " W" << endl;
                        cout << indent_str_next << "Runtime Dynamic = " << l2dir.rt_power.readOp.dynamic << " W" << endl;
                        cout <<endl;
                }
                if (numNOC >0){
                        cout <<indent_str<<"Total NoCs (Network/Bus): "<<endl;
                        displayDeviceType(XML->sys.device_type, indent);
                        cout << indent_str_next << "Area = " << noc.area.get_area()*1e-6<< " mm^2" << endl;
                        cout << indent_str_next << "Peak Dynamic = " << noc.power.readOp.dynamic << " W" << endl;
                        cout << indent_str_next << "Subthreshold Leakage = "
                                << (long_channel? noc.power.readOp.longer_channel_leakage:noc.power.readOp.leakage) <<" W" << endl;
                        //cout << indent_str_next << "Subthreshold Leakage = " << noc.power.readOp.longer_channel_leakage  <<" W" << endl;
                        cout << indent_str_next << "Gate Leakage = " << noc.power.readOp.gate_leakage << " W" << endl;
                        cout << indent_str_next << "Runtime Dynamic = " << noc.rt_power.readOp.dynamic << " W" << endl;
                        cout <<endl;
                }
                if (XML->sys.mc.number_mcs >0 && XML->sys.mc.memory_channels_per_mc>0)
                {
                        cout <<indent_str<<"Total MCs: "<<XML->sys.mc.number_mcs << " Memory Controllers "<<endl;
                        displayDeviceType(XML->sys.device_type, indent);
                        cout << indent_str_next << "Area = " << mcs.area.get_area()*1e-6<< " mm^2" << endl;
                        cout << indent_str_next << "Peak Dynamic = " << mcs.power.readOp.dynamic << " W" << endl;
                        cout << indent_str_next << "Subthreshold Leakage = "
                                << (long_channel? mcs.power.readOp.longer_channel_leakage:mcs.power.readOp.leakage)  <<" W" << endl;
                        cout << indent_str_next << "Gate Leakage = " << mcs.power.readOp.gate_leakage << " W" << endl;
                        cout << indent_str_next << "Runtime Dynamic = " << mcs.rt_power.readOp.dynamic << " W" << endl;
                        cout <<endl;
                }
                if (XML->sys.flashc.number_mcs >0)
                {
                        cout <<indent_str<<"Total Flash/SSD Controllers: "<<flashcontroller->fcp.num_mcs << " Flash/SSD Controllers "<<endl;
                        displayDeviceType(XML->sys.device_type, indent);
                        cout << indent_str_next << "Area = " << flashcontrollers.area.get_area()*1e-6<< " mm^2" << endl;
                        cout << indent_str_next << "Peak Dynamic = " << flashcontrollers.power.readOp.dynamic << " W" << endl;
                        cout << indent_str_next << "Subthreshold Leakage = "
                                << (long_channel? flashcontrollers.power.readOp.longer_channel_leakage:flashcontrollers.power.readOp.leakage)  <<" W" << endl;
                        cout << indent_str_next << "Gate Leakage = " << flashcontrollers.power.readOp.gate_leakage << " W" << endl;
                        cout << indent_str_next << "Runtime Dynamic = " << flashcontrollers.rt_power.readOp.dynamic << " W" << endl;
                        cout <<endl;
                }
                if (XML->sys.niu.number_units >0 )
                {
                        cout <<indent_str<<"Total NIUs: "<<niu->niup.num_units << " Network Interface Units "<<endl;
                        displayDeviceType(XML->sys.device_type, indent);
                        cout << indent_str_next << "Area = " << nius.area.get_area()*1e-6<< " mm^2" << endl;
                        cout << indent_str_next << "Peak Dynamic = " << nius.power.readOp.dynamic << " W" << endl;
                        cout << indent_str_next << "Subthreshold Leakage = "
                                << (long_channel? nius.power.readOp.longer_channel_leakage:nius.power.readOp.leakage)  <<" W" << endl;
                        cout << indent_str_next << "Gate Leakage = " << nius.power.readOp.gate_leakage << " W" << endl;
                        cout << indent_str_next << "Runtime Dynamic = " << nius.rt_power.readOp.dynamic << " W" << endl;
                        cout <<endl;
                }
                if (XML->sys.pcie.number_units >0 && XML->sys.pcie.num_channels>0)
                                {
                                        cout <<indent_str<<"Total PCIes: "<<pcie->pciep.num_units << " PCIe Controllers "<<endl;
                                        displayDeviceType(XML->sys.device_type, indent);
                                        cout << indent_str_next << "Area = " << pcies.area.get_area()*1e-6<< " mm^2" << endl;
                                        cout << indent_str_next << "Peak Dynamic = " << pcies.power.readOp.dynamic << " W" << endl;
                                        cout << indent_str_next << "Subthreshold Leakage = "
                                                << (long_channel? pcies.power.readOp.longer_channel_leakage:pcies.power.readOp.leakage)  <<" W" << endl;
                                        cout << indent_str_next << "Gate Leakage = " << pcies.power.readOp.gate_leakage << " W" << endl;
                                        cout << indent_str_next << "Runtime Dynamic = " << pcies.rt_power.readOp.dynamic << " W" << endl;
                                        cout <<endl;
                                }
                cout <<"*****************************************************************************************"<<endl;
                if (plevel >1)
                {
                        for (i = 0;i < numCore; i++)
                        {
                                cores[i]->displayEnergy(indent+4,plevel,is_tdp);
                                cout <<"*****************************************************************************************"<<endl;
                        }
                        if (!XML->sys.Private_L2)
                        {
                                for (i = 0;i < numL2; i++)
                                {
                                        l2array[i]->displayEnergy(indent+4,is_tdp);
                                        cout <<"*****************************************************************************************"<<endl;
                                }
                        }
                        for (i = 0;i < numL3; i++)
                        {
                                l3array[i]->displayEnergy(indent+4,is_tdp);
                                cout <<"*****************************************************************************************"<<endl;
                        }
                        for (i = 0;i < numL1Dir; i++)
                        {
                                l1dirarray[i]->displayEnergy(indent+4,is_tdp);
                                cout <<"*****************************************************************************************"<<endl;
                        }
                        for (i = 0;i < numL2Dir; i++)
                        {
                                l2dirarray[i]->displayEnergy(indent+4,is_tdp);
                                cout <<"*****************************************************************************************"<<endl;
                        }
                        if (XML->sys.mc.number_mcs >0 && XML->sys.mc.memory_channels_per_mc>0)
                        {
                                mc->displayEnergy(indent+4,is_tdp);
                                cout <<"*****************************************************************************************"<<endl;
                        }
                        if (XML->sys.flashc.number_mcs >0 && XML->sys.flashc.memory_channels_per_mc>0)
                        {
                                flashcontroller->displayEnergy(indent+4,is_tdp);
                                cout <<"*****************************************************************************************"<<endl;
                        }
                        if (XML->sys.niu.number_units >0 )
                        {
                                niu->displayEnergy(indent+4,is_tdp);
                                cout <<"*****************************************************************************************"<<endl;
                        }
                        if (XML->sys.pcie.number_units >0 && XML->sys.pcie.num_channels>0)
                        {
                                pcie->displayEnergy(indent+4,is_tdp);
                                cout <<"*****************************************************************************************"<<endl;
                        }
                        for (i = 0;i < numNOC; i++)
                        {
                                nocs[i]->displayEnergy(indent+4,plevel,is_tdp);
                                cout <<"*****************************************************************************************"<<endl;
                        }
                }
        }
        else
        {
        }
 }
 /*
  * Fills procdynp (component counts and homogeneity flags) and interface_ip
  * (the shared input-parameter block) from the parsed XML in XML->sys.
  * Everything that is not read from the XML is a fixed constant set here.
  * No validation of the core or NoC counts is performed in this routine.
  */
 void Processor::set_proc_param()
 {
     // Hard-wired off. When enabled, the fixed stand-in values in the first
     // branch below are used instead of the XML-supplied ones.
     const bool debug = false;
     if (debug)
     {
         procdynp.homoCore  = true;
         procdynp.homoL2    = true;
         procdynp.homoL3    = true;
         procdynp.homoNOC   = true;
         procdynp.homoL1Dir = true;
         procdynp.homoL2Dir = true;
         interface_ip.data_arr_ram_cell_tech_type    = 0;
         interface_ip.data_arr_peri_global_tech_type = 0;
         interface_ip.tag_arr_ram_cell_tech_type     = 0;
         interface_ip.tag_arr_peri_global_tech_type  = 0;
         interface_ip.ic_proj_type                   = 0;
         interface_ip.temp                           = 360;
         interface_ip.F_sz_nm                        = 90;
     }
     else
     {
         procdynp.homoCore  = bool(XML->sys.homogeneous_cores);
         procdynp.homoL2    = bool(XML->sys.homogeneous_L2s);
         procdynp.homoL3    = bool(XML->sys.homogeneous_L3s);
         procdynp.homoNOC   = bool(XML->sys.homogeneous_NoCs);
         procdynp.homoL1Dir = bool(XML->sys.homogeneous_L1Directories);
         procdynp.homoL2Dir = bool(XML->sys.homogeneous_L2Directories);
         interface_ip.data_arr_ram_cell_tech_type    = XML->sys.device_type;
         interface_ip.data_arr_peri_global_tech_type = XML->sys.device_type;
         interface_ip.tag_arr_ram_cell_tech_type     = XML->sys.device_type;
         interface_ip.tag_arr_peri_global_tech_type  = XML->sys.device_type;
         interface_ip.ic_proj_type                   = XML->sys.interconnect_projection_type;
         interface_ip.temp                           = XML->sys.temperature;
         interface_ip.F_sz_nm                        = XML->sys.core_tech_node;
     }
     // Feature size in micrometres, derived from the nanometre value chosen above.
     interface_ip.F_sz_um = interface_ip.F_sz_nm / 1000;
     // Component counts, copied straight from the XML.
     procdynp.numCore      = XML->sys.number_of_cores;
     procdynp.numL2        = XML->sys.number_of_L2s;
     procdynp.numL3        = XML->sys.number_of_L3s;
     procdynp.numNOC       = XML->sys.number_of_NoCs;
     procdynp.numL1Dir     = XML->sys.number_of_L1Directories;
     procdynp.numL2Dir     = XML->sys.number_of_L2Directories;
     procdynp.numMC        = XML->sys.mc.number_mcs;
     procdynp.numMCChannel = XML->sys.mc.memory_channels_per_mc;
     // Optimization weights: fixed numbers. The delay weight makes sure timing
     // can be satisfied; the others drive the exhaustive search for
     // individual components.
     interface_ip.delay_wt         = 100;
     interface_ip.area_wt          = 0;
     interface_ip.dynamic_power_wt = 100;
     interface_ip.leakage_power_wt = 0;
     interface_ip.cycle_time_wt    = 0;
     // Allowed deviations: fixed numbers as well.
     interface_ip.delay_dev         = 10000;
     interface_ip.area_dev          = 10000;
     interface_ip.dynamic_power_dev = 10000;
     interface_ip.leakage_power_dev = 10000;
     interface_ip.cycle_time_dev    = 10000;
     interface_ip.ed                = 2;
     // Fixed for the processor section, since memory is processed separately.
     interface_ip.burst_len      = 1;
     interface_ip.int_prefetch_w = 1;
     interface_ip.page_sz_bits   = 0;
     // The values from here to force_cache_config carry no real meaning: they
     // only guarantee that every field has an initial value, and they are
     // overridden during each component's own initialization.
     interface_ip.cache_sz     = 64;
     interface_ip.line_sz      = 1;
     interface_ip.assoc        = 1;
     interface_ip.nbanks       = 1;
     interface_ip.out_w        = interface_ip.line_sz * 8;
     interface_ip.specific_tag = 1;
     interface_ip.tag_w        = 64;
     interface_ip.access_mode  = 2;
     interface_ip.obj_func_dyn_energy = 0;
     interface_ip.obj_func_dyn_power  = 0;
     interface_ip.obj_func_leak_power = 0;
     interface_ip.obj_func_cycle_t    = 1;
     interface_ip.is_main_mem     = false;
     interface_ip.rpters_in_htree = true;
     interface_ip.ver_htree_wires_over_array         = 0;
     interface_ip.broadcast_addr_din_over_ver_htrees = 0;
     interface_ip.num_rw_ports     = 1;
     interface_ip.num_rd_ports     = 0;
     interface_ip.num_wr_ports     = 0;
     interface_ip.num_se_rd_ports  = 0;
     interface_ip.num_search_ports = 1;
     interface_ip.nuca            = 0;
     interface_ip.nuca_bank_count = 0;
     interface_ip.is_cache           = true;
     interface_ip.pure_ram           = false;
     interface_ip.pure_cam           = false;
     interface_ip.force_cache_config = false;
     // Wire selection depends on the Embedded flag: Global_30 with material
     // type 0 when it is set, Global with material type 2 otherwise.
     const int wire_mat_type = XML->sys.Embedded ? 0 : 2;
     interface_ip.wt               = XML->sys.Embedded ? Global_30 : Global;
     interface_ip.wire_is_mat_type = wire_mat_type;
     interface_ip.wire_os_mat_type = wire_mat_type;
     interface_ip.force_wiretype = false;
     interface_ip.print_detail   = 1;
     interface_ip.add_ecc_b_     = true;
 }
 /* Deletes every object a pointer vector refers to and leaves the vector empty. */
 template <typename T>
 static void destroy_elements(std::vector<T *> &items)
 {
     while (!items.empty())
     {
         delete items.back();
         items.pop_back();
     }
 }
 /*
  * Releases the component models held by this Processor.
  *
  * Fixes relative to the previous version:
  *  - The controller pointers were guarded by inverted tests
  *    ("if (!mc) delete mc;"), so a non-null controller was never freed.
  *    delete on a null pointer is a no-op, so no guard is needed at all.
  *  - The first- and second-level directory vectors were never released,
  *    unlike cores/l2array/l3array/nocs.
  *
  * NOTE(review): assumes the constructor null-initializes mc, niu, pcie and
  * flashcontroller when they are not created, and that the directory vectors
  * own their elements the same way the other vectors do -- confirm against
  * Processor::Processor.
  */
 Processor::~Processor()
 {
     destroy_elements(cores);
     destroy_elements(l2array);
     destroy_elements(l3array);
     destroy_elements(l1dirarray);
     destroy_elements(l2dirarray);
     destroy_elements(nocs);
     delete mc;
     delete niu;
     delete pcie;
     delete flashcontroller;
 }
--- a/ext/mcpat/processor.h
+++ b/ext/mcpat/processor.h
@ -0,0 +1,79 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/
 #ifndef PROCESSOR_H_
 #define PROCESSOR_H_
 #include <vector>
 #include "XML_Parse.h"
 #include "arbiter.h"
 #include "area.h"
 #include "array.h"
 #include "basic_components.h"
 #include "core.h"
 #include "decoder.h"
 #include "iocontrollers.h"
 #include "memoryctrl.h"
 #include "noc.h"
 #include "parameter.h"
 #include "router.h"
 #include "sharedcache.h"
 class Processor : public Component
 {
  public:
        ParseXML *XML;
        vector<Core *> cores;
    vector<SharedCache *> l2array;
    vector<SharedCache *> l3array;
    vector<SharedCache *> l1dirarray;
    vector<SharedCache *> l2dirarray;
    vector<NoC *>  nocs;
    MemoryController * mc;
    NIUController    * niu;
    PCIeController   * pcie;
    FlashController  * flashcontroller;
    InputParameter interface_ip;
    ProcParam procdynp;
    //wire	globalInterconnect;
    //clock_network globalClock;
    Component core, l2, l3, l1dir, l2dir, noc, mcs, cc, nius, pcies,flashcontrollers;
    int  numCore, numL2, numL3, numNOC, numL1Dir, numL2Dir;
    Processor(ParseXML *XML_interface);
    void compute();
    void set_proc_param();
    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
    void displayDeviceType(int device_type_, uint32_t indent = 0);
    void displayInterconnectType(int interconnect_type_, uint32_t indent = 0);
    ~Processor();
 };
 #endif /* PROCESSOR_H_ */
--- a/ext/mcpat/results/A9_2000
+++ b/ext/mcpat/results/A9_2000
@ -0,0 +1,321 @@
 McPAT (version 0.8 of Aug, 2010) is computing the target processor...
 Warning: Branch Target Buffer array structure cannot satisfy latency constraint.
 McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
 *****************************************************************************************
  Technology 40 nm
  Using Long Channel Devices When Appropriate
  Interconnect metal projection= conservative interconnect technology projection
  Core clock Rate(MHz) 2000
 *****************************************************************************************
 Processor: 
  Area = 5.83937 mm^2
  Peak Power = 1.32283 W
  Total Leakage = 0.182558 W
  Peak Dynamic = 1.14027 W
  Subthreshold Leakage = 0.0869601 W
  Gate Leakage = 0.095598 W
  Runtime Dynamic = 2.86361 W
  Total Cores: 
  Device Type= ITRS low operating power device type
    Area = 5.33485 mm^2
    Peak Dynamic = 1.07823 W
    Subthreshold Leakage = 0.0827641 W
    Gate Leakage = 0.0887315 W
    Runtime Dynamic = 0.975395 W
  Total First Level Directory: 
  Device Type= ITRS low operating power device type
    Area = 0.489711 mm^2
    Peak Dynamic = 0.0449752 W
    Subthreshold Leakage = 0.00397708 W
    Gate Leakage = 0.00655632 W
    Runtime Dynamic = 1.80289 W
  Total NoCs (Network/Bus): 
  Device Type= ITRS low operating power device type
    Area = 0.0148119 mm^2
    Peak Dynamic = 0.0170648 W
    Subthreshold Leakage = 0.000218992 W
    Gate Leakage = 0.000310207 W
    Runtime Dynamic = 0.0853239 W
 *****************************************************************************************
 Core:
      Area = 2.66742 mm^2
      Peak Dynamic = 0.539116 W
      Subthreshold Leakage = 0.041382 W
      Gate Leakage = 0.0443657 W
      Runtime Dynamic = 0.975395 W
      Instruction Fetch Unit:
        Area = 0.565848 mm^2
        Peak Dynamic = 0.184724 W
        Subthreshold Leakage = 0.00572394 W
        Gate Leakage = 0.00380598 W
        Runtime Dynamic = 0.283222 W
          Instruction Cache:
            Area = 0.235613 mm^2
            Peak Dynamic = 0.0310428 W
            Subthreshold Leakage = 0.00309635 W
            Gate Leakage = 0.00216385 W
            Runtime Dynamic = 0.0461626 W
          Branch Target Buffer:
            Area = 0.251259 mm^2
            Peak Dynamic = 0.0174433 W
            Subthreshold Leakage = 0.00170231 W
            Gate Leakage = 0.000908123 W
            Runtime Dynamic = 0.0697733 W
          Branch Predictor:
            Area = 0.064441 mm^2
            Peak Dynamic = 0.00815792 W
            Subthreshold Leakage = 0.00070444 W
            Gate Leakage = 0.000477387 W
            Runtime Dynamic = 0.0113878 W
              Global Predictor:
                Area = 0.0313969 mm^2
                Peak Dynamic = 0.00374527 W
                Subthreshold Leakage = 0.00034631 W
                Gate Leakage = 0.000233555 W
                Runtime Dynamic = 0.00545806 W
              Local Predictor:
                Area = 0.000711939 mm^2
                Peak Dynamic = 0.000301014 W
                Subthreshold Leakage = 6.13457e-06 W
                Gate Leakage = 5.63471e-06 W
                Runtime Dynamic = 0.000471566 W
                Area = 0.000650815 mm^2
                Peak Dynamic = 0.000230123 W
                Subthreshold Leakage = 5.7769e-06 W
                Gate Leakage = 4.75075e-06 W
                Runtime Dynamic = 0.000354988 W
              Chooser:
                Area = 0.0313969 mm^2
                Peak Dynamic = 0.00374527 W
                Subthreshold Leakage = 0.00034631 W
                Gate Leakage = 0.000233555 W
                Runtime Dynamic = 0.00545806 W
              RAS:
                Area = 0.000996272 mm^2
                Peak Dynamic = 0.000366372 W
                Subthreshold Leakage = 5.68653e-06 W
                Gate Leakage = 4.64147e-06 W
                Runtime Dynamic = 6.23994e-08 W
          Instruction Buffer:
            Area = 0.00820192 mm^2
            Peak Dynamic = 0.0669878 W
            Subthreshold Leakage = 6.33536e-05 W
            Gate Leakage = 4.34841e-05 W
            Runtime Dynamic = 0.0382787 W
          Instruction Decoder:
            Area = 0.00468731 mm^2
            Peak Dynamic = 0.05881 W
            Subthreshold Leakage = 0.000127696 W
            Gate Leakage = 0.000115494 W
            Runtime Dynamic = 0.11762 W
      Renaming Unit:
        Area = 0.0903068 mm^2
        Peak Dynamic = 0.0451514 W
        Subthreshold Leakage = 0.000345688 W
        Gate Leakage = 0.00032022 W
        Runtime Dynamic = 0.0731287 W
          Int Front End RAT:
            Area = 0.0543672 mm^2
            Peak Dynamic = 0.0237617 W
            Subthreshold Leakage = 0.000175223 W
            Gate Leakage = 0.000121525 W
            Runtime Dynamic = 0.0475234 W
          FP Front End RAT:
            Area = 0.0185325 mm^2
            Peak Dynamic = 0.00949419 W
            Subthreshold Leakage = 0.000100325 W
            Gate Leakage = 6.76251e-05 W
            Runtime Dynamic = 0.00949419 W
          Free List:
            Area = 0.00599955 mm^2
            Peak Dynamic = 0.00225065 W
            Subthreshold Leakage = 1.24363e-05 W
            Gate Leakage = 1.00844e-05 W
            Runtime Dynamic = 0.0090026 W
          Int Retire RAT: 
            Area = 0.00605969 mm^2
            Peak Dynamic = 0.00448392 W
            Subthreshold Leakage = 1.33231e-05 W
            Gate Leakage = 1.16235e-05 W
            Runtime Dynamic = 0.00448392 W
          FP Retire RAT:
            Area = 0.000650815 mm^2
            Peak Dynamic = 0.00067334 W
            Subthreshold Leakage = 5.7769e-06 W
            Gate Leakage = 4.75075e-06 W
            Runtime Dynamic = 0.00067334 W
          FP Free List:
            Area = 0.00305098 mm^2
            Peak Dynamic = 0.00195124 W
            Subthreshold Leakage = 8.81712e-06 W
            Gate Leakage = 6.96054e-06 W
            Runtime Dynamic = 0.00195124 W
      Load Store Unit:
        Area = 0.274913 mm^2
        Peak Dynamic = 0.0347482 W
        Subthreshold Leakage = 0.0032012 W
        Gate Leakage = 0.00235752 W
        Runtime Dynamic = 0.195304 W
          Data Cache:
            Area = 0.240878 mm^2
            Peak Dynamic = 0.0293665 W
            Subthreshold Leakage = 0.00312878 W
            Gate Leakage = 0.00220794 W
            Runtime Dynamic = 0.19026 W
          StoreQ:
            Area = 0.00754674 mm^2
            Peak Dynamic = 0.00358087 W
            Subthreshold Leakage = 4.2633e-05 W
            Gate Leakage = 5.19212e-05 W
            Runtime Dynamic = 0.00504348 W
      Memory Management Unit:
        Area = 0.021508 mm^2
        Peak Dynamic = 0.0127337 W
        Subthreshold Leakage = 0.000210621 W
        Gate Leakage = 0.000290666 W
        Runtime Dynamic = 0.037071 W
          Itlb:
            Area = 0.00993091 mm^2
            Peak Dynamic = 0.00617846 W
            Subthreshold Leakage = 9.04168e-05 W
            Gate Leakage = 9.65082e-05 W
            Runtime Dynamic = 0.012357 W
          Dtlb:
            Area = 0.00993091 mm^2
            Peak Dynamic = 0.00438671 W
            Subthreshold Leakage = 9.04168e-05 W
            Gate Leakage = 9.65082e-05 W
            Runtime Dynamic = 0.0247139 W
      Execution Unit:
        Area = 1.65498 mm^2
        Peak Dynamic = 0.261758 W
        Subthreshold Leakage = 0.0305522 W
        Gate Leakage = 0.0360036 W
        Runtime Dynamic = 0.386669 W
          Register Files:
            Area = 0.203203 mm^2
            Peak Dynamic = 0.0763282 W
            Subthreshold Leakage = 0.000197046 W
            Gate Leakage = 0.00016338 W
            Runtime Dynamic = 0.0386066 W
              Integer RF:
                Area = 0.146073 mm^2
                Peak Dynamic = 0.0763282 W
                Subthreshold Leakage = 0.000120303 W
                Gate Leakage = 9.97867e-05 W
                Runtime Dynamic = 0.0345689 W
              Floating Point RF:
                Area = 0.05713 mm^2
                Peak Dynamic = 0 W
                Subthreshold Leakage = 7.67427e-05 W
                Gate Leakage = 6.35938e-05 W
                Runtime Dynamic = 0.00403765 W
          Instruction Scheduler:
            Area = 0.0582889 mm^2
            Peak Dynamic = 0.0522571 W
            Subthreshold Leakage = 0.000128698 W
            Gate Leakage = 0.000185714 W
            Runtime Dynamic = 0.0787473 W
              Instruction Window:
                Area = 0.053925 mm^2
                Peak Dynamic = 0.0445895 W
                Subthreshold Leakage = 9.52936e-05 W
                Gate Leakage = 0.000130718 W
                Runtime Dynamic = 0.0602231 W
              FP Instruction Window:
                Area = 0.00436388 mm^2
                Peak Dynamic = 0.00766759 W
                Subthreshold Leakage = 3.34043e-05 W
                Gate Leakage = 5.49962e-05 W
                Runtime Dynamic = 0.0185242 W
          Integer ALUs (Count: 3 ):
            Area = 0.312404 mm^2
            Peak Dynamic = 0.0283684 W
            Subthreshold Leakage = 0.0140724 W
            Gate Leakage = 0.0165703 W
            Runtime Dynamic = 0.0373268 W
          Floating Point Units (FPUs) (Count: 1 ):
            Area = 0.971259 mm^2
            Peak Dynamic = 0 W
            Subthreshold Leakage = 0.0109377 W
            Gate Leakage = 0.0128792 W
            Runtime Dynamic = 0.0373268 W
          Complex ALUs (Mul/Div) (Count: 1 ):
            Area = 0.104135 mm^2
            Peak Dynamic = 0.0204053 W
            Subthreshold Leakage = 0.00469079 W
            Gate Leakage = 0.00552345 W
            Runtime Dynamic = 0.049769 W
          Results Broadcast Bus:
            Area Overhead = 0.00404385 mm^2
            Peak Dynamic = 0.0824719 W
            Subthreshold Leakage = 0.000495836 W
            Gate Leakage = 0.000583852 W
            Runtime Dynamic = 0.144892 W
 *****************************************************************************************
 First Level Directory
      Area = 0.244856 mm^2
      Peak Dynamic = 0.0224876 W
      Subthreshold Leakage = 0.00198854 W
      Gate Leakage = 0.00327816 W
      Runtime Dynamic = 1.80289 W
 *****************************************************************************************
 BUSES
      Area = 0.0148119 mm^2
      Peak Dynamic = 0.0170648 W
      Subthreshold Leakage = 0.000218992 W
      Gate Leakage = 0.000310207 W
      Runtime Dynamic = 0.0853239 W
      Bus: 
        Area = 0.0148119 mm^2
        Peak Dynamic = 0.0170648 W
        Subthreshold Leakage = 0.000218992 W
        Gate Leakage = 0.000310207 W
        Runtime Dynamic = 0.0853239 W
 *****************************************************************************************
--- a/ext/mcpat/results/A9_2000_withIOC
+++ b/ext/mcpat/results/A9_2000_withIOC
@ -0,0 +1,410 @@
 McPAT (version 0.8 of Aug, 2010) is computing the target processor...
 Warning: Branch Target Buffer array structure cannot satisfy latency constraint.
 SerDer_dyn 0.00216115
 ctrl_dyn 0.0278216
 ctrl_dyn 6.14856e-11
 SerDer_dyn 1.54368e-11
 McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
 *****************************************************************************************
  Technology 40 nm
  Using Long Channel Devices When Appropriate
  Interconnect metal projection= conservative interconnect technology projection
  Core clock Rate(MHz) 2000
 *****************************************************************************************
 Processor: 
  Area = 7.05775 mm^2
  Peak Power = 2.06734 W
  Total Leakage = 0.204814 W
  Peak Dynamic = 1.86253 W
  Subthreshold Leakage = 0.0916805 W
  Gate Leakage = 0.113134 W
  Runtime Dynamic = 5.3744 W
  Total Cores: 2 cores 
  Device Type= ITRS low operating power device type
    Area = 5.33485 mm^2
    Peak Dynamic = 1.07823 W
    Subthreshold Leakage = 0.0827641 W
    Gate Leakage = 0.0887315 W
    Runtime Dynamic = 0.975395 W
  Total First Level Directory: 
  Device Type= ITRS low operating power device type
    Area = 0.489711 mm^2
    Peak Dynamic = 0.0449752 W
    Subthreshold Leakage = 0.00397708 W
    Gate Leakage = 0.00655632 W
    Runtime Dynamic = 1.80289 W
  Total NoCs (Network/Bus): 
  Device Type= ITRS low operating power device type
    Area = 0.0162858 mm^2
    Peak Dynamic = 0.0187629 W
    Subthreshold Leakage = 0.000240784 W
    Gate Leakage = 0.000341076 W
    Runtime Dynamic = 0.0938146 W
  Total MCs: 1 Memory Controllers 
  Device Type= ITRS low operating power device type
    Area = 0.554183 mm^2
    Peak Dynamic = 0.31033 W
    Subthreshold Leakage = 0.0020922 W
    Gate Leakage = 0.00751531 W
    Runtime Dynamic = 2.21514 W
  Total Flash/SSD Controllers: 1 Flash/SSD Controllers 
  Device Type= ITRS low operating power device type
    Area = 0.109065 mm^2
    Peak Dynamic = 0.0299827 W
    Subthreshold Leakage = 0.000522213 W
    Gate Leakage = 0.0020015 W
    Runtime Dynamic = 0.0209879 W
  Total NIUs: 1 Network Interface Units 
  Device Type= ITRS low operating power device type
    Area = 0.261302 mm^2
    Peak Dynamic = 0.164859 W
    Subthreshold Leakage = 0.000730171 W
    Gate Leakage = 0.00279855 W
    Runtime Dynamic = 0.115402 W
  Total PCIes: 1 PCIe Controllers 
  Device Type= ITRS low operating power device type
    Area = 0.292355 mm^2
    Peak Dynamic = 0.215383 W
    Subthreshold Leakage = 0.00135405 W
    Gate Leakage = 0.00518971 W
    Runtime Dynamic = 0.150768 W
 *****************************************************************************************
 Core:
      Area = 2.66742 mm^2
      Peak Dynamic = 0.539116 W
      Subthreshold Leakage = 0.041382 W
      Gate Leakage = 0.0443657 W
      Runtime Dynamic = 0.975395 W
      Instruction Fetch Unit:
        Area = 0.565848 mm^2
        Peak Dynamic = 0.184724 W
        Subthreshold Leakage = 0.00572394 W
        Gate Leakage = 0.00380598 W
        Runtime Dynamic = 0.283222 W
          Instruction Cache:
            Area = 0.235613 mm^2
            Peak Dynamic = 0.0310428 W
            Subthreshold Leakage = 0.00309635 W
            Gate Leakage = 0.00216385 W
            Runtime Dynamic = 0.0461626 W
          Branch Target Buffer:
            Area = 0.251259 mm^2
            Peak Dynamic = 0.0174433 W
            Subthreshold Leakage = 0.00170231 W
            Gate Leakage = 0.000908123 W
            Runtime Dynamic = 0.0697733 W
          Branch Predictor:
            Area = 0.064441 mm^2
            Peak Dynamic = 0.00815792 W
            Subthreshold Leakage = 0.00070444 W
            Gate Leakage = 0.000477387 W
            Runtime Dynamic = 0.0113878 W
              Global Predictor:
                Area = 0.0313969 mm^2
                Peak Dynamic = 0.00374527 W
                Subthreshold Leakage = 0.00034631 W
                Gate Leakage = 0.000233555 W
                Runtime Dynamic = 0.00545806 W
              Local Predictor:
                Area = 0.000711939 mm^2
                Peak Dynamic = 0.000301014 W
                Subthreshold Leakage = 6.13457e-06 W
                Gate Leakage = 5.63471e-06 W
                Runtime Dynamic = 0.000471566 W
                Area = 0.000650815 mm^2
                Peak Dynamic = 0.000230123 W
                Subthreshold Leakage = 5.7769e-06 W
                Gate Leakage = 4.75075e-06 W
                Runtime Dynamic = 0.000354988 W
              Chooser:
                Area = 0.0313969 mm^2
                Peak Dynamic = 0.00374527 W
                Subthreshold Leakage = 0.00034631 W
                Gate Leakage = 0.000233555 W
                Runtime Dynamic = 0.00545806 W
              RAS:
                Area = 0.000996272 mm^2
                Peak Dynamic = 0.000366372 W
                Subthreshold Leakage = 5.68653e-06 W
                Gate Leakage = 4.64147e-06 W
                Runtime Dynamic = 6.23994e-08 W
          Instruction Buffer:
            Area = 0.00820192 mm^2
            Peak Dynamic = 0.0669878 W
            Subthreshold Leakage = 6.33536e-05 W
            Gate Leakage = 4.34841e-05 W
            Runtime Dynamic = 0.0382787 W
          Instruction Decoder:
            Area = 0.00468731 mm^2
            Peak Dynamic = 0.05881 W
            Subthreshold Leakage = 0.000127696 W
            Gate Leakage = 0.000115494 W
            Runtime Dynamic = 0.11762 W
      Renaming Unit:
        Area = 0.0903068 mm^2
        Peak Dynamic = 0.0451514 W
        Subthreshold Leakage = 0.000345688 W
        Gate Leakage = 0.00032022 W
        Runtime Dynamic = 0.0731287 W
          Int Front End RAT:
            Area = 0.0543672 mm^2
            Peak Dynamic = 0.0237617 W
            Subthreshold Leakage = 0.000175223 W
            Gate Leakage = 0.000121525 W
            Runtime Dynamic = 0.0475234 W
          FP Front End RAT:
            Area = 0.0185325 mm^2
            Peak Dynamic = 0.00949419 W
            Subthreshold Leakage = 0.000100325 W
            Gate Leakage = 6.76251e-05 W
            Runtime Dynamic = 0.00949419 W
          Free List:
            Area = 0.00599955 mm^2
            Peak Dynamic = 0.00225065 W
            Subthreshold Leakage = 1.24363e-05 W
            Gate Leakage = 1.00844e-05 W
            Runtime Dynamic = 0.0090026 W
          Int Retire RAT: 
            Area = 0.00605969 mm^2
            Peak Dynamic = 0.00448392 W
            Subthreshold Leakage = 1.33231e-05 W
            Gate Leakage = 1.16235e-05 W
            Runtime Dynamic = 0.00448392 W
          FP Retire RAT:
            Area = 0.000650815 mm^2
            Peak Dynamic = 0.00067334 W
            Subthreshold Leakage = 5.7769e-06 W
            Gate Leakage = 4.75075e-06 W
            Runtime Dynamic = 0.00067334 W
          FP Free List:
            Area = 0.00305098 mm^2
            Peak Dynamic = 0.00195124 W
            Subthreshold Leakage = 8.81712e-06 W
            Gate Leakage = 6.96054e-06 W
            Runtime Dynamic = 0.00195124 W
      Load Store Unit:
        Area = 0.274913 mm^2
        Peak Dynamic = 0.0347482 W
        Subthreshold Leakage = 0.0032012 W
        Gate Leakage = 0.00235752 W
        Runtime Dynamic = 0.195304 W
          Data Cache:
            Area = 0.240878 mm^2
            Peak Dynamic = 0.0293665 W
            Subthreshold Leakage = 0.00312878 W
            Gate Leakage = 0.00220794 W
            Runtime Dynamic = 0.19026 W
          StoreQ:
            Area = 0.00754674 mm^2
            Peak Dynamic = 0.00358087 W
            Subthreshold Leakage = 4.2633e-05 W
            Gate Leakage = 5.19212e-05 W
            Runtime Dynamic = 0.00504348 W
      Memory Management Unit:
        Area = 0.021508 mm^2
        Peak Dynamic = 0.0127337 W
        Subthreshold Leakage = 0.000210621 W
        Gate Leakage = 0.000290666 W
        Runtime Dynamic = 0.037071 W
          Itlb:
            Area = 0.00993091 mm^2
            Peak Dynamic = 0.00617846 W
            Subthreshold Leakage = 9.04168e-05 W
            Gate Leakage = 9.65082e-05 W
            Runtime Dynamic = 0.012357 W
          Dtlb:
            Area = 0.00993091 mm^2
            Peak Dynamic = 0.00438671 W
            Subthreshold Leakage = 9.04168e-05 W
            Gate Leakage = 9.65082e-05 W
            Runtime Dynamic = 0.0247139 W
      Execution Unit:
        Area = 1.65498 mm^2
        Peak Dynamic = 0.261758 W
        Subthreshold Leakage = 0.0305522 W
        Gate Leakage = 0.0360036 W
        Runtime Dynamic = 0.386669 W
          Register Files:
            Area = 0.203203 mm^2
            Peak Dynamic = 0.0763282 W
            Subthreshold Leakage = 0.000197046 W
            Gate Leakage = 0.00016338 W
            Runtime Dynamic = 0.0386066 W
              Integer RF:
                Area = 0.146073 mm^2
                Peak Dynamic = 0.0763282 W
                Subthreshold Leakage = 0.000120303 W
                Gate Leakage = 9.97867e-05 W
                Runtime Dynamic = 0.0345689 W
              Floating Point RF:
                Area = 0.05713 mm^2
                Peak Dynamic = 0 W
                Subthreshold Leakage = 7.67427e-05 W
                Gate Leakage = 6.35938e-05 W
                Runtime Dynamic = 0.00403765 W
          Instruction Scheduler:
            Area = 0.0582889 mm^2
            Peak Dynamic = 0.0522571 W
            Subthreshold Leakage = 0.000128698 W
            Gate Leakage = 0.000185714 W
            Runtime Dynamic = 0.0787473 W
              Instruction Window:
                Area = 0.053925 mm^2
                Peak Dynamic = 0.0445895 W
                Subthreshold Leakage = 9.52936e-05 W
                Gate Leakage = 0.000130718 W
                Runtime Dynamic = 0.0602231 W
              FP Instruction Window:
                Area = 0.00436388 mm^2
                Peak Dynamic = 0.00766759 W
                Subthreshold Leakage = 3.34043e-05 W
                Gate Leakage = 5.49962e-05 W
                Runtime Dynamic = 0.0185242 W
          Integer ALUs (Count: 3 ):
            Area = 0.312404 mm^2
            Peak Dynamic = 0.0283684 W
            Subthreshold Leakage = 0.0140724 W
            Gate Leakage = 0.0165703 W
            Runtime Dynamic = 0.0373268 W
          Floating Point Units (FPUs) (Count: 1 ):
            Area = 0.971259 mm^2
            Peak Dynamic = 0 W
            Subthreshold Leakage = 0.0109377 W
            Gate Leakage = 0.0128792 W
            Runtime Dynamic = 0.0373268 W
          Complex ALUs (Mul/Div) (Count: 1 ):
            Area = 0.104135 mm^2
            Peak Dynamic = 0.0204053 W
            Subthreshold Leakage = 0.00469079 W
            Gate Leakage = 0.00552345 W
            Runtime Dynamic = 0.049769 W
          Results Broadcast Bus:
            Area Overhead = 0.00404385 mm^2
            Peak Dynamic = 0.0824719 W
            Subthreshold Leakage = 0.000495836 W
            Gate Leakage = 0.000583852 W
            Runtime Dynamic = 0.144892 W
 *****************************************************************************************
 First Level Directory
      Area = 0.244856 mm^2
      Peak Dynamic = 0.0224876 W
      Subthreshold Leakage = 0.00198854 W
      Gate Leakage = 0.00327816 W
      Runtime Dynamic = 1.80289 W
 *****************************************************************************************
 Memory Controller:
      Area = 0.554183 mm^2
      Peak Dynamic = 0.31033 W
      Subthreshold Leakage = 0.0020922 W
      Gate Leakage = 0.00751531 W
      Runtime Dynamic = 2.21514 W
      Front End Engine:
        Area = 0.111447 mm^2
        Peak Dynamic = 0.0117646 W
        Subthreshold Leakage = 0.000188068 W
        Gate Leakage = 0.000217277 W
        Runtime Dynamic = 0.0796061 W
      Transaction Engine:
        Area = 0.113609 mm^2
        Peak Dynamic = 0.160252 W
        Subthreshold Leakage = 0.000380826 W
        Gate Leakage = 0.00145961 W
        Runtime Dynamic = 1.08436 W
      PHY:
        Area = 0.329127 mm^2
        Peak Dynamic = 0.138314 W
        Subthreshold Leakage = 0.00152331 W
        Gate Leakage = 0.00583843 W
        Runtime Dynamic = 1.05117 W
 *****************************************************************************************
 Flash Controller:
      Area = 0.109065 mm^2
      Peak Dynamic = 0.0299827 W
      Subthreshold Leakage = 0.000522213 W
      Gate Leakage = 0.0020015 W
      Runtime Dynamic = 0.0209879 W
 *****************************************************************************************
 NIU:
      Area = 0.261302 mm^2
      Peak Dynamic = 0.164859 W
      Subthreshold Leakage = 0.000730171 W
      Gate Leakage = 0.00279855 W
      Runtime Dynamic = 0.115402 W
 *****************************************************************************************
 PCIe:
      Area = 0.292355 mm^2
      Peak Dynamic = 0.215383 W
      Subthreshold Leakage = 0.00135405 W
      Gate Leakage = 0.00518971 W
      Runtime Dynamic = 0.150768 W
 *****************************************************************************************
 BUSES
      Area = 0.0162858 mm^2
      Peak Dynamic = 0.0187629 W
      Subthreshold Leakage = 0.000240784 W
      Gate Leakage = 0.000341076 W
      Runtime Dynamic = 0.0938146 W
      Bus: 
        Area = 0.0162858 mm^2
        Peak Dynamic = 0.0187629 W
        Subthreshold Leakage = 0.000240784 W
        Gate Leakage = 0.000341076 W
        Runtime Dynamic = 0.0938146 W
 *****************************************************************************************
--- a/ext/mcpat/results/A9_800
+++ b/ext/mcpat/results/A9_800
@ -0,0 +1,320 @@
 McPAT (version 0.8 of Aug, 2010) is computing the target processor...
 McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
 *****************************************************************************************
  Technology 40 nm
  Using Long Channel Devices When Appropriate
  Interconnect metal projection= conservative interconnect technology projection
  Core clock Rate(MHz) 800
 *****************************************************************************************
 Processor: 
  Area = 5.48929 mm^2
  Peak Power = 0.577263 W
  Total Leakage = 0.127046 W
  Peak Dynamic = 0.450217 W
  Subthreshold Leakage = 0.0608257 W
  Gate Leakage = 0.0662198 W
  Runtime Dynamic = 1.13304 W
  Total Cores: 
  Device Type= ITRS low operating power device type
    Area = 4.98521 mm^2
    Peak Dynamic = 0.425609 W
    Subthreshold Leakage = 0.0577408 W
    Gate Leakage = 0.061241 W
    Runtime Dynamic = 0.37879 W
  Total First Level Directory: 
  Device Type= ITRS low operating power device type
    Area = 0.489711 mm^2
    Peak Dynamic = 0.0179901 W
    Subthreshold Leakage = 0.0029286 W
    Gate Leakage = 0.00476045 W
    Runtime Dynamic = 0.721156 W
  Total NoCs (Network/Bus): 
  Device Type= ITRS low operating power device type
    Area = 0.0143604 mm^2
    Peak Dynamic = 0.00661787 W
    Subthreshold Leakage = 0.000156344 W
    Gate Leakage = 0.000218372 W
    Runtime Dynamic = 0.0330893 W
 *****************************************************************************************
 Core:
      Area = 2.49261 mm^2
      Peak Dynamic = 0.212805 W
      Subthreshold Leakage = 0.0288704 W
      Gate Leakage = 0.0306205 W
      Runtime Dynamic = 0.37879 W
      Instruction Fetch Unit:
        Area = 0.450898 mm^2
        Peak Dynamic = 0.0710479 W
        Subthreshold Leakage = 0.00360576 W
        Gate Leakage = 0.00232348 W
        Runtime Dynamic = 0.101921 W
          Instruction Cache:
            Area = 0.235613 mm^2
            Peak Dynamic = 0.0124171 W
            Subthreshold Leakage = 0.00228006 W
            Gate Leakage = 0.00157114 W
            Runtime Dynamic = 0.018465 W
          Branch Target Buffer:
            Area = 0.136309 mm^2
            Peak Dynamic = 0.00413545 W
            Subthreshold Leakage = 0.000644359 W
            Gate Leakage = 0.000219381 W
            Runtime Dynamic = 0.0165418 W
          Branch Predictor:
            Area = 0.064441 mm^2
            Peak Dynamic = 0.00326317 W
            Subthreshold Leakage = 0.000518728 W
            Gate Leakage = 0.000346624 W
            Runtime Dynamic = 0.0045551 W
              Global Predictor:
                Area = 0.0313969 mm^2
                Peak Dynamic = 0.00149811 W
                Subthreshold Leakage = 0.000255012 W
                Gate Leakage = 0.000169581 W
                Runtime Dynamic = 0.00218323 W
              Local Predictor:
                Area = 0.000711939 mm^2
                Peak Dynamic = 0.000120406 W
                Subthreshold Leakage = 4.51731e-06 W
                Gate Leakage = 4.09128e-06 W
                Runtime Dynamic = 0.000188626 W
                Area = 0.000650815 mm^2
                Peak Dynamic = 9.20494e-05 W
                Subthreshold Leakage = 4.25393e-06 W
                Gate Leakage = 3.44945e-06 W
                Runtime Dynamic = 0.000141995 W
              Chooser:
                Area = 0.0313969 mm^2
                Peak Dynamic = 0.00149811 W
                Subthreshold Leakage = 0.000255012 W
                Gate Leakage = 0.000169581 W
                Runtime Dynamic = 0.00218323 W
              RAS:
                Area = 0.000996272 mm^2
                Peak Dynamic = 0.000146549 W
                Subthreshold Leakage = 4.18739e-06 W
                Gate Leakage = 3.3701e-06 W
                Runtime Dynamic = 2.49598e-08 W
          Instruction Buffer:
            Area = 0.00820192 mm^2
            Peak Dynamic = 0.0267951 W
            Subthreshold Leakage = 4.66516e-05 W
            Gate Leakage = 3.15732e-05 W
            Runtime Dynamic = 0.0153115 W
          Instruction Decoder:
            Area = 0.00468731 mm^2
            Peak Dynamic = 0.023524 W
            Subthreshold Leakage = 9.40317e-05 W
            Gate Leakage = 8.38587e-05 W
            Runtime Dynamic = 0.047048 W
      Renaming Unit:
        Area = 0.0903068 mm^2
        Peak Dynamic = 0.0180606 W
        Subthreshold Leakage = 0.000254554 W
        Gate Leakage = 0.000232507 W
        Runtime Dynamic = 0.0292515 W
          Int Front End RAT:
            Area = 0.0543672 mm^2
            Peak Dynamic = 0.00950468 W
            Subthreshold Leakage = 0.000129029 W
            Gate Leakage = 8.82378e-05 W
            Runtime Dynamic = 0.0190094 W
          FP Front End RAT:
            Area = 0.0185325 mm^2
            Peak Dynamic = 0.00379768 W
            Subthreshold Leakage = 7.38761e-05 W
            Gate Leakage = 4.91016e-05 W
            Runtime Dynamic = 0.00379768 W
          Free List:
            Area = 0.00599955 mm^2
            Peak Dynamic = 0.00090026 W
            Subthreshold Leakage = 9.15772e-06 W
            Gate Leakage = 7.32213e-06 W
            Runtime Dynamic = 0.00360104 W
          Int Retire RAT: 
            Area = 0.00605969 mm^2
            Peak Dynamic = 0.00179357 W
            Subthreshold Leakage = 9.8107e-06 W
            Gate Leakage = 8.43969e-06 W
            Runtime Dynamic = 0.00179357 W
          FP Retire RAT:
            Area = 0.000650815 mm^2
            Peak Dynamic = 0.000269336 W
            Subthreshold Leakage = 4.25393e-06 W
            Gate Leakage = 3.44945e-06 W
            Runtime Dynamic = 0.000269336 W
          FP Free List:
            Area = 0.00305098 mm^2
            Peak Dynamic = 0.000780497 W
            Subthreshold Leakage = 6.49266e-06 W
            Gate Leakage = 5.05395e-06 W
            Runtime Dynamic = 0.000780497 W
      Load Store Unit:
        Area = 0.274913 mm^2
        Peak Dynamic = 0.0138993 W
        Subthreshold Leakage = 0.00235727 W
        Gate Leakage = 0.00171176 W
        Runtime Dynamic = 0.0781216 W
          Data Cache:
            Area = 0.240878 mm^2
            Peak Dynamic = 0.0117466 W
            Subthreshold Leakage = 0.00230394 W
            Gate Leakage = 0.00160316 W
            Runtime Dynamic = 0.0761042 W
          StoreQ:
            Area = 0.00754674 mm^2
            Peak Dynamic = 0.00143235 W
            Subthreshold Leakage = 3.13936e-05 W
            Gate Leakage = 3.76992e-05 W
            Runtime Dynamic = 0.00201739 W
      Memory Management Unit:
        Area = 0.021508 mm^2
        Peak Dynamic = 0.0050935 W
        Subthreshold Leakage = 0.000155095 W
        Gate Leakage = 0.000211049 W
        Runtime Dynamic = 0.0148284 W
          Itlb:
            Area = 0.00993091 mm^2
            Peak Dynamic = 0.00247139 W
            Subthreshold Leakage = 6.65801e-05 W
            Gate Leakage = 7.00732e-05 W
            Runtime Dynamic = 0.0049428 W
          Dtlb:
            Area = 0.00993091 mm^2
            Peak Dynamic = 0.00175468 W
            Subthreshold Leakage = 6.65801e-05 W
            Gate Leakage = 7.00732e-05 W
            Runtime Dynamic = 0.00988557 W
      Execution Unit:
        Area = 1.65498 mm^2
        Peak Dynamic = 0.104703 W
        Subthreshold Leakage = 0.0224977 W
        Gate Leakage = 0.0261417 W
        Runtime Dynamic = 0.154667 W
          Register Files:
            Area = 0.203203 mm^2
            Peak Dynamic = 0.0305313 W
            Subthreshold Leakage = 0.000145099 W
            Gate Leakage = 0.000118628 W
            Runtime Dynamic = 0.0154426 W
              Integer RF:
                Area = 0.146073 mm^2
                Peak Dynamic = 0.0305313 W
                Subthreshold Leakage = 8.85877e-05 W
                Gate Leakage = 7.24537e-05 W
                Runtime Dynamic = 0.0138276 W
              Floating Point RF:
                Area = 0.05713 mm^2
                Peak Dynamic = 0 W
                Subthreshold Leakage = 5.6511e-05 W
                Gate Leakage = 4.61745e-05 W
                Runtime Dynamic = 0.00161506 W
          Instruction Scheduler:
            Area = 0.0582889 mm^2
            Peak Dynamic = 0.0209028 W
            Subthreshold Leakage = 9.47693e-05 W
            Gate Leakage = 0.000134844 W
            Runtime Dynamic = 0.0314989 W
              Instruction Window:
                Area = 0.053925 mm^2
                Peak Dynamic = 0.0178358 W
                Subthreshold Leakage = 7.01713e-05 W
                Gate Leakage = 9.49122e-05 W
                Runtime Dynamic = 0.0240893 W
              FP Instruction Window:
                Area = 0.00436388 mm^2
                Peak Dynamic = 0.00306704 W
                Subthreshold Leakage = 2.45979e-05 W
                Gate Leakage = 3.99319e-05 W
                Runtime Dynamic = 0.00740966 W
          Integer ALUs (Count: 3 ):
            Area = 0.312404 mm^2
            Peak Dynamic = 0.0113473 W
            Subthreshold Leakage = 0.0103625 W
            Gate Leakage = 0.0120315 W
            Runtime Dynamic = 0.0149307 W
          Floating Point Units (FPUs) (Count: 1 ):
            Area = 0.971259 mm^2
            Peak Dynamic = 0 W
            Subthreshold Leakage = 0.00805417 W
            Gate Leakage = 0.00935142 W
            Runtime Dynamic = 0.0149307 W
          Complex ALUs (Mul/Div) (Count: 1 ):
            Area = 0.104135 mm^2
            Peak Dynamic = 0.00816212 W
            Subthreshold Leakage = 0.00345415 W
            Gate Leakage = 0.0040105 W
            Runtime Dynamic = 0.0199076 W
          Results Broadcast Bus:
            Area Overhead = 0.00404385 mm^2
            Peak Dynamic = 0.0329888 W
            Subthreshold Leakage = 0.000365119 W
            Gate Leakage = 0.000423926 W
            Runtime Dynamic = 0.0579569 W
 *****************************************************************************************
 First Level Directory
      Area = 0.244856 mm^2
      Peak Dynamic = 0.00899504 W
      Subthreshold Leakage = 0.0014643 W
      Gate Leakage = 0.00238022 W
      Runtime Dynamic = 0.721156 W
 *****************************************************************************************
 BUSES
      Area = 0.0143604 mm^2
      Peak Dynamic = 0.00661787 W
      Subthreshold Leakage = 0.000156344 W
      Gate Leakage = 0.000218372 W
      Runtime Dynamic = 0.0330893 W
      Bus: 
        Area = 0.0143604 mm^2
        Peak Dynamic = 0.00661787 W
        Subthreshold Leakage = 0.000156344 W
        Gate Leakage = 0.000218372 W
        Runtime Dynamic = 0.0330893 W
 *****************************************************************************************
--- a/ext/mcpat/results/Alpha21364
+++ b/ext/mcpat/results/Alpha21364
@ -0,0 +1,441 @@
 McPAT (version 0.7 of May, 2010) is computing the target processor...
 Warning: icache array structure cannot satisfy throughput constraint.
 Warning: icache array structure cannot satisfy latency constraint.
 Warning: InstBuffer array structure cannot satisfy throughput constraint.
 Warning: InstBuffer array structure cannot satisfy latency constraint.
 Warning: Branch Target Buffer array structure cannot satisfy throughput constraint.
 Warning: Branch Target Buffer array structure cannot satisfy latency constraint.
 Warning: Global Predictor array structure cannot satisfy throughput constraint.
 Warning: Global Predictor array structure cannot satisfy latency constraint.
 Warning: L1 local Predictor array structure cannot satisfy throughput constraint.
 Warning: L1 local Predictor array structure cannot satisfy latency constraint.
 Warning: L2 local Predictor array structure cannot satisfy throughput constraint.
 Warning: L2 local Predictor array structure cannot satisfy latency constraint.
 Warning: Predictor Chooser array structure cannot satisfy throughput constraint.
 Warning: Predictor Chooser array structure cannot satisfy latency constraint.
 Warning: RAS array structure cannot satisfy throughput constraint.
 Warning: RAS array structure cannot satisfy latency constraint.
 Warning: dcache array structure cannot satisfy throughput constraint.
 Warning: dcache array structure cannot satisfy latency constraint.
 Warning: Integer Register File array structure cannot satisfy throughput constraint.
 Warning: Integer Register File array structure cannot satisfy latency constraint.
 Warning: Floating point Register File array structure cannot satisfy throughput constraint.
 Warning: Floating point Register File array structure cannot satisfy latency constraint.
 Warning: ReorderBuffer array structure cannot satisfy throughput constraint.
 Warning: ReorderBuffer array structure cannot satisfy latency constraint.
 Warning: Int RetireRAT array structure cannot satisfy throughput constraint.
 Warning: Int RetireRAT array structure cannot satisfy latency constraint.
 Warning: Int RetireRAT array structure cannot satisfy latency constraint.
 Warning: Int Free List array structure cannot satisfy throughput constraint.
 Warning: Int Free List array structure cannot satisfy latency constraint.
 Warning: Int Free List array structure cannot satisfy throughput constraint.
 Warning: Int Free List array structure cannot satisfy latency constraint.
 Warning: MC ReadBuffer array structure cannot satisfy throughput constraint.
 Warning: MC ReadBuffer array structure cannot satisfy latency constraint.
 Warning: MC writeBuffer array structure cannot satisfy throughput constraint.
 Warning: MC writeBuffer array structure cannot satisfy latency constraint.
 McPAT (version 0.7 of May, 2010) results  (current print level is 5)
 *****************************************************************************************
  Technology 180 nm
  Interconnect metal projection= aggressive interconnect technology projection
  Core clock Rate(MHz) 1200
 *****************************************************************************************
 Processor: 
  Area = 323.859 mm^2
  Peak Power = 90.0375 W
  Total Leakage = 0.156795 W
  Peak Dynamic = 89.8807 W
  Subthreshold Leakage = 0.151936 W
  Gate Leakage = 0.00485969 W
  Runtime Dynamic = 85.2036 W
  Total Cores: 
  Device Type= ITRS high performance device type
    Area = 137.839 mm^2
    Peak Dynamic = 60.6776 W
    Subthreshold Leakage = 0.067186 W
    Gate Leakage = 0.00428355 W
    Runtime Dynamic = 73.9555 W
  Total L2s: 
  Device Type= ITRS high performance device type
    Area = 137.063 mm^2
    Peak Dynamic = 3.55835 W
    Subthreshold Leakage = 0.0778886 W
    Gate Leakage = 0.00016078 W
    Runtime Dynamic = 6.34872 W
  Total First Level Directory: 
  Device Type= ITRS high performance device type
    Area = 1.59954 mm^2
    Peak Dynamic = 0.805902 W
    Subthreshold Leakage = 0.000311783 W
    Gate Leakage = 2.63568e-05 W
    Runtime Dynamic = 0.547665 W
  Total NoCs (Network/Bus): 
  Device Type= ITRS high performance device type
    Area = 29.1057 mm^2
    Peak Dynamic = 16.5188 W
    Subthreshold Leakage = 0.00292556 W
    Gate Leakage = 0.000166293 W
    Runtime Dynamic = 2.54446 W
  Total MCs: 
  Device Type= ITRS high performance device type
    Area = 18.2519 mm^2
    Peak Dynamic = 8.32001 W
    Subthreshold Leakage = 0.00362353 W
    Gate Leakage = 0.000222708 W
    Runtime Dynamic = 1.80731 W
 *****************************************************************************************
 Core:
      Area = 137.839 mm^2
      Peak Dynamic = 60.6776 W
      Subthreshold Leakage = 0.067186 W
      Gate Leakage = 0.00428355 W
      Runtime Dynamic = 73.9555 W
      Instruction Fetch Unit:
        Area = 27.6096 mm^2
        Peak Dynamic = 9.86655 W
        Subthreshold Leakage = 0.00622106 W
        Gate Leakage = 0.000344671 W
        Runtime Dynamic = 10.0567 W
          Instruction Cache:
            Area = 11.4511 mm^2
            Peak Dynamic = 1.53259 W
            Subthreshold Leakage = 0.00371341 W
            Gate Leakage = 0.000171069 W
            Runtime Dynamic = 2.13168 W
          Branch Target Buffer:
            Area = 13.3377 mm^2
            Peak Dynamic = 0.56236 W
            Subthreshold Leakage = 0.001581 W
            Gate Leakage = 9.5198e-05 W
            Runtime Dynamic = 2.24944 W
          Branch Predictor:
            Area = 2.1618 mm^2
            Peak Dynamic = 0.234643 W
            Subthreshold Leakage = 0.000469396 W
            Gate Leakage = 2.01907e-05 W
            Runtime Dynamic = 0.198646 W
              Global Predictor:
                Area = 0.893575 mm^2
                Peak Dynamic = 0.0726984 W
                Subthreshold Leakage = 0.000182866 W
                Gate Leakage = 7.91951e-06 W
                Runtime Dynamic = 0.0726984 W
              Local Predictor:
                Area = 0.420241 mm^2
                Peak Dynamic = 0.0532456 W
                Subthreshold Leakage = 9.20027e-05 W
                Gate Leakage = 3.89162e-06 W
                Runtime Dynamic = 0.0532456 W
                Area = 0.291886 mm^2
                Peak Dynamic = 0.0292091 W
                Subthreshold Leakage = 5.262e-05 W
                Gate Leakage = 2.51093e-06 W
                Runtime Dynamic = 0.0292091 W
              Chooser:
                Area = 0.893575 mm^2
                Peak Dynamic = 0.0726984 W
                Subthreshold Leakage = 0.000182866 W
                Gate Leakage = 7.91951e-06 W
                Runtime Dynamic = 0.0726984 W
              RAS:
                Area = 0.0827607 mm^2
                Peak Dynamic = 0.0360009 W
                Subthreshold Leakage = 1.16623e-05 W
                Gate Leakage = 4.60036e-07 W
                Runtime Dynamic = 3.58028e-06 W
          Instruction Buffer:
            Area = 0.465385 mm^2
            Peak Dynamic = 2.10455 W
            Subthreshold Leakage = 6.13248e-05 W
            Gate Leakage = 4.88113e-06 W
            Runtime Dynamic = 1.40303 W
          Instruction Decoder:
            Area = 0.146031 mm^2
            Peak Dynamic = 4.07384 W
            Subthreshold Leakage = 7.07416e-05 W
            Gate Leakage = 3.32268e-06 W
            Runtime Dynamic = 4.07384 W
      Renaming Unit:
        Area = 11.7262 mm^2
        Peak Dynamic = 12.5584 W
        Subthreshold Leakage = 0.000886804 W
        Gate Leakage = 9.92419e-05 W
        Runtime Dynamic = 9.90647 W
          Int Front End RAT:
            Area = 8.24345 mm^2
            Peak Dynamic = 8.04227 W
            Subthreshold Leakage = 0.000376247 W
            Gate Leakage = 3.40623e-05 W
            Runtime Dynamic = 8.04227 W
          FP Front End RAT:
            Area = 2.549 mm^2
            Peak Dynamic = 2.75082 W
            Subthreshold Leakage = 0.000149367 W
            Gate Leakage = 1.30084e-05 W
            Runtime Dynamic = 1.37541 W
          Free List:
            Area = 0.446019 mm^2
            Peak Dynamic = 0.156051 W
            Subthreshold Leakage = 1.32133e-05 W
            Gate Leakage = 7.4667e-07 W
            Runtime Dynamic = 0.312102 W
          Int Retire RAT: 
            Area = 0.184445 mm^2
            Peak Dynamic = 0.102656 W
            Subthreshold Leakage = 8.50239e-06 W
            Gate Leakage = 5.28869e-07 W
            Runtime Dynamic = 0.102656 W
          FP Retire RAT:
            Area = 0.0567228 mm^2
            Peak Dynamic = 0.0367258 W
            Subthreshold Leakage = 5.67894e-06 W
            Gate Leakage = 3.75578e-07 W
            Runtime Dynamic = 0.0183629 W
          FP Free List:
            Area = 0.198929 mm^2
            Peak Dynamic = 0.111293 W
            Subthreshold Leakage = 8.61952e-06 W
            Gate Leakage = 5.10875e-07 W
            Runtime Dynamic = 0.0556467 W
      Load Store Unit:
        Area = 49.742 mm^2
        Peak Dynamic = 11.7952 W
        Subthreshold Leakage = 0.00715349 W
        Gate Leakage = 0.00052778 W
        Runtime Dynamic = 31.7658 W
          Data Cache:
            Area = 36.106 mm^2
            Peak Dynamic = 9.28008 W
            Subthreshold Leakage = 0.00663485 W
            Gate Leakage = 0.000466572 W
            Runtime Dynamic = 31.332 W
          LoadQ:
            Area = 2.60005 mm^2
            Peak Dynamic = 0.578279 W
            Subthreshold Leakage = 9.67302e-05 W
            Gate Leakage = 5.59905e-06 W
            Runtime Dynamic = 0.14457 W
          StoreQ:
            Area = 2.60005 mm^2
            Peak Dynamic = 0.578279 W
            Subthreshold Leakage = 9.67302e-05 W
            Gate Leakage = 5.59905e-06 W
            Runtime Dynamic = 0.289139 W
      Memory Management Unit:
        Area = 8.74543 mm^2
        Peak Dynamic = 3.77198 W
        Subthreshold Leakage = 0.00119904 W
        Gate Leakage = 0.000127183 W
        Runtime Dynamic = 4.82688 W
          Itlb:
            Area = 1.97969 mm^2
            Peak Dynamic = 0.537563 W
            Subthreshold Leakage = 0.000270576 W
            Gate Leakage = 2.0845e-05 W
            Runtime Dynamic = 1.07513 W
          Dtlb:
            Area = 6.71814 mm^2
            Peak Dynamic = 1.87586 W
            Subthreshold Leakage = 0.00060329 W
            Gate Leakage = 5.63286e-05 W
            Runtime Dynamic = 3.75174 W
      Execution Unit:
        Area = 31.4918 mm^2
        Peak Dynamic = 22.6855 W
        Subthreshold Leakage = 0.0320294 W
        Gate Leakage = 0.00198102 W
        Runtime Dynamic = 17.3997 W
          Register Files:
            Area = 9.9318 mm^2
            Peak Dynamic = 3.92301 W
            Subthreshold Leakage = 0.000295352 W
            Gate Leakage = 1.33517e-05 W
            Runtime Dynamic = 1.7929 W
              Integer RF:
                Area = 6.76678 mm^2
                Peak Dynamic = 2.35597 W
                Subthreshold Leakage = 0.000185762 W
                Gate Leakage = 8.51701e-06 W
                Runtime Dynamic = 1.60634 W
              Floating Point RF:
                Area = 3.16503 mm^2
                Peak Dynamic = 1.56704 W
                Subthreshold Leakage = 0.00010959 W
                Gate Leakage = 4.83467e-06 W
                Runtime Dynamic = 0.186553 W
          Instruction Scheduler:
            Area = 5.20691 mm^2
            Peak Dynamic = 2.77224 W
            Subthreshold Leakage = 0.000202187 W
            Gate Leakage = 1.05832e-05 W
            Runtime Dynamic = 3.11355 W
              Instruction Window:
                Area = 1.23862 mm^2
                Peak Dynamic = 0.985117 W
                Subthreshold Leakage = 5.55506e-05 W
                Gate Leakage = 3.78978e-06 W
                Runtime Dynamic = 1.23906 W
              FP Instruction Window:
                Area = 0.481718 mm^2
                Peak Dynamic = 0.438839 W
                Subthreshold Leakage = 2.5962e-05 W
                Gate Leakage = 2.00351e-06 W
                Runtime Dynamic = 0.526208 W
              ROB:
                Area = 3.48657 mm^2
                Peak Dynamic = 1.34828 W
                Subthreshold Leakage = 0.000120674 W
                Gate Leakage = 4.78991e-06 W
                Runtime Dynamic = 1.34828 W
          Integer ALUs (Count: 4 ):
            Area = 3.4944 mm^2
            Peak Dynamic = 4.23312 W
            Subthreshold Leakage = 0.016149 W
            Gate Leakage = 0.000986885 W
            Runtime Dynamic = 3.21343 W
          Floating Point Units (FPUs) (Count: 1 ):
            Area = 12.705 mm^2
            Peak Dynamic = 3.52215 W
            Subthreshold Leakage = 0.0146787 W
            Gate Leakage = 0.000897034 W
            Runtime Dynamic = 3.52215 W
          Results Broadcast Bus:
            Area Overhead = 0.106062 mm^2
            Peak Dynamic = 6.87645 W
            Subthreshold Leakage = 0.000378957 W
            Gate Leakage = 2.31585e-05 W
            Runtime Dynamic = 5.75766 W
 *****************************************************************************************
 L2
      Area = 137.063 mm^2
      Peak Dynamic = 3.55835 W
      Subthreshold Leakage = 0.0778886 W
      Gate Leakage = 0.00016078 W
      Runtime Dynamic = 6.34872 W
 *****************************************************************************************
 Second Level Directory
      Area = 1.59954 mm^2
      Peak Dynamic = 0.805902 W
      Subthreshold Leakage = 0.000311783 W
      Gate Leakage = 2.63568e-05 W
      Runtime Dynamic = 0.547665 W
 *****************************************************************************************
 Memory Controller:
      Area = 9.12595 mm^2
      Peak Dynamic = 4.16 W
      Subthreshold Leakage = 0.00181177 W
      Gate Leakage = 0.000111354 W
      Runtime Dynamic = 1.80731 W
      Front End Engine:
        Area = 5.49326 mm^2
        Peak Dynamic = 1.42883 W
        Subthreshold Leakage = 0.000132955 W
        Gate Leakage = 8.76015e-06 W
        Runtime Dynamic = 0.348049 W
      Transaction Engine:
        Area = 1.50616 mm^2
        Peak Dynamic = 1.93117 W
        Subthreshold Leakage = 0.000696058 W
        Gate Leakage = 4.25369e-05 W
        Runtime Dynamic = 0.579332 W
      PHY:
        Area = 2.12653 mm^2
        Peak Dynamic = 0.8 W
        Subthreshold Leakage = 0.000982753 W
        Gate Leakage = 6.00571e-05 W
        Runtime Dynamic = 0.879928 W
 *****************************************************************************************
 NOC
      Area = 29.1057 mm^2
      Peak Dynamic = 16.5188 W
      Subthreshold Leakage = 0.00292556 W
      Gate Leakage = 0.000166293 W
      Runtime Dynamic = 2.54446 W
      Router: 
        Area = 28.4197 mm^2
        Peak Dynamic = 8.76431 W
        Subthreshold Leakage = 0.00199965 W
        Gate Leakage = 0.000109709 W
        Runtime Dynamic = 1.25204 W
            Virtual Channel Buffer:
              Area = 17.0424 mm^2
              Peak Dynamic = 7.30291 W
              Subthreshold Leakage = 0.00119658 W
              Gate Leakage = 4.15511e-05 W
              Runtime Dynamic = 1.04327 W
            Crossbar:
              Area = 0.357655 mm^2
              Peak Dynamic = 1.27997 W
              Subthreshold Leakage = 0.000801415 W
              Gate Leakage = 6.80527e-05 W
              Runtime Dynamic = 0.182853 W
            Arbiter:
              Peak Dynamic = 0.18143 W
              Subthreshold Leakage = 1.65956e-06 W
              Gate Leakage = 1.05559e-07 W
              Runtime Dynamic = 0.0259186 W
      Per Router : 
        Area = 0.685989 mm^2
        Peak Dynamic = 7.75447 W
        Subthreshold Leakage = 0.000925911 W
        Gate Leakage = 5.65834e-05 W
        Runtime Dynamic = 1.29241 W
 *****************************************************************************************
--- a/ext/mcpat/results/Alpha21364_90nm
+++ b/ext/mcpat/results/Alpha21364_90nm
@ -0,0 +1,408 @@
 McPAT (version 0.8 of Aug, 2010) is computing the target processor...
 Warning: icache array structure cannot satisfy latency constraint.
 Warning: dcache array structure cannot satisfy latency constraint.
 McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
 *****************************************************************************************
  Technology 90 nm
  Interconnect metal projection= aggressive interconnect technology projection
  Core clock Rate(MHz) 1200
 *****************************************************************************************
 Processor: 
  Area = 139.86 mm^2
  Peak Power = 34.9936 W
  Total Leakage = 4.16949 W
  Peak Dynamic = 30.8241 W
  Subthreshold Leakage = 3.86203 W
  Gate Leakage = 0.307463 W
  Runtime Dynamic = 34.0612 W
  Total Cores: 
  Device Type= ITRS high performance device type
    Area = 61.1957 mm^2
    Peak Dynamic = 19.6269 W
    Subthreshold Leakage = 2.04452 W
    Gate Leakage = 0.277429 W
    Runtime Dynamic = 29.5972 W
  Total L2s: 
  Device Type= ITRS high performance device type
    Area = 62.2653 mm^2
    Peak Dynamic = 1.42987 W
    Subthreshold Leakage = 1.65481 W
    Gate Leakage = 0.00860545 W
    Runtime Dynamic = 2.73329 W
  Total First Level Directory: 
  Device Type= ITRS high performance device type
    Area = 0.533824 mm^2
    Peak Dynamic = 0.275566 W
    Subthreshold Leakage = 0.00929753 W
    Gate Leakage = 0.00179126 W
    Runtime Dynamic = 0.193681 W
  Total NoCs (Network/Bus): 
  Device Type= ITRS high performance device type
    Area = 8.77595 mm^2
    Peak Dynamic = 6.17873 W
    Subthreshold Leakage = 0.108357 W
    Gate Leakage = 0.0139259 W
    Runtime Dynamic = 0.963385 W
  Total MCs: 
  Device Type= ITRS high performance device type
    Area = 7.08925 mm^2
    Peak Dynamic = 3.3131 W
    Subthreshold Leakage = 0.0450389 W
    Gate Leakage = 0.00571171 W
    Runtime Dynamic = 0.573656 W
 *****************************************************************************************
 Core:
      Area = 61.1957 mm^2
      Peak Dynamic = 19.6269 W
      Subthreshold Leakage = 2.04452 W
      Gate Leakage = 0.277429 W
      Runtime Dynamic = 29.5972 W
      Instruction Fetch Unit:
        Area = 7.40352 mm^2
        Peak Dynamic = 2.10646 W
        Subthreshold Leakage = 0.126581 W
        Gate Leakage = 0.0150397 W
        Runtime Dynamic = 2.55478 W
          Instruction Cache:
            Area = 5.01657 mm^2
            Peak Dynamic = 0.745807 W
            Subthreshold Leakage = 0.0906167 W
            Gate Leakage = 0.010922 W
            Runtime Dynamic = 1.22193 W
          Branch Target Buffer:
            Area = 1.63475 mm^2
            Peak Dynamic = 0.0974373 W
            Subthreshold Leakage = 0.0188281 W
            Gate Leakage = 0.00126965 W
            Runtime Dynamic = 0.389749 W
          Branch Predictor:
            Area = 0.474272 mm^2
            Peak Dynamic = 0.0682449 W
            Subthreshold Leakage = 0.00901262 W
            Gate Leakage = 0.00067136 W
            Runtime Dynamic = 0.0636543 W
              Global Predictor:
                Area = 0.190297 mm^2
                Peak Dynamic = 0.0224229 W
                Subthreshold Leakage = 0.00351842 W
                Gate Leakage = 0.000260107 W
                Runtime Dynamic = 0.0239711 W
              Local Predictor:
                Area = 0.0959237 mm^2
                Peak Dynamic = 0.0143301 W
                Subthreshold Leakage = 0.00171829 W
                Gate Leakage = 0.00012889 W
                Runtime Dynamic = 0.015711 W
                Area = 0.0484908 mm^2
                Peak Dynamic = 0.0077514 W
                Subthreshold Leakage = 0.000926283 W
                Gate Leakage = 7.55051e-05 W
                Runtime Dynamic = 0.00850163 W
              Chooser:
                Area = 0.190297 mm^2
                Peak Dynamic = 0.0224229 W
                Subthreshold Leakage = 0.00351842 W
                Gate Leakage = 0.000260107 W
                Runtime Dynamic = 0.0239711 W
              RAS:
                Area = 0.0451868 mm^2
                Peak Dynamic = 0.00906891 W
                Subthreshold Leakage = 0.00025749 W
                Gate Leakage = 2.22565e-05 W
                Runtime Dynamic = 1.06361e-06 W
          Instruction Buffer:
            Area = 0.11139 mm^2
            Peak Dynamic = 0.30298 W
            Subthreshold Leakage = 0.000556928 W
            Gate Leakage = 4.34124e-05 W
            Runtime Dynamic = 0.201987 W
          Instruction Decoder:
            Area = 0.0481902 mm^2
            Peak Dynamic = 0.677465 W
            Subthreshold Leakage = 0.00135195 W
            Gate Leakage = 0.000132907 W
            Runtime Dynamic = 0.677465 W
      Renaming Unit:
        Area = 4.5037 mm^2
        Peak Dynamic = 4.11785 W
        Subthreshold Leakage = 0.0296009 W
        Gate Leakage = 0.00668098 W
        Runtime Dynamic = 3.24944 W
          Int Front End RAT:
            Area = 2.76467 mm^2
            Peak Dynamic = 2.43279 W
            Subthreshold Leakage = 0.0129405 W
            Gate Leakage = 0.00255854 W
            Runtime Dynamic = 2.43279 W
          FP Front End RAT:
            Area = 1.39233 mm^2
            Peak Dynamic = 1.35403 W
            Subthreshold Leakage = 0.00981219 W
            Gate Leakage = 0.00205621 W
            Runtime Dynamic = 0.677017 W
          Free List:
            Area = 0.116928 mm^2
            Peak Dynamic = 0.0436483 W
            Subthreshold Leakage = 0.000259915 W
            Gate Leakage = 2.53395e-05 W
            Runtime Dynamic = 0.0872966 W
          Int Retire RAT: 
            Area = 0.0429772 mm^2
            Peak Dynamic = 0.0318091 W
            Subthreshold Leakage = 0.000152798 W
            Gate Leakage = 1.86722e-05 W
            Runtime Dynamic = 0.0318091 W
          FP Retire RAT:
            Area = 0.0153516 mm^2
            Peak Dynamic = 0.00997874 W
            Subthreshold Leakage = 8.06509e-05 W
            Gate Leakage = 7.17049e-06 W
            Runtime Dynamic = 0.00498937 W
          FP Free List:
            Area = 0.0530951 mm^2
            Peak Dynamic = 0.0310624 W
            Subthreshold Leakage = 0.000140326 W
            Gate Leakage = 1.46766e-05 W
            Runtime Dynamic = 0.0155312 W
      Load Store Unit:
        Area = 20.5622 mm^2
        Peak Dynamic = 5.14439 W
        Subthreshold Leakage = 0.207699 W
        Gate Leakage = 0.0357344 W
        Runtime Dynamic = 16.0217 W
          Data Cache:
            Area = 15.2468 mm^2
            Peak Dynamic = 4.5468 W
            Subthreshold Leakage = 0.19694 W
            Gate Leakage = 0.0331746 W
            Runtime Dynamic = 15.8781 W
          LoadQ:
            Area = 0.863734 mm^2
            Peak Dynamic = 0.191536 W
            Subthreshold Leakage = 0.00227213 W
            Gate Leakage = 0.000279753 W
            Runtime Dynamic = 0.047884 W
          StoreQ:
            Area = 0.863734 mm^2
            Peak Dynamic = 0.191536 W
            Subthreshold Leakage = 0.00227213 W
            Gate Leakage = 0.000279753 W
            Runtime Dynamic = 0.0957681 W
      Memory Management Unit:
        Area = 3.49533 mm^2
        Peak Dynamic = 1.34391 W
        Subthreshold Leakage = 0.0412098 W
        Gate Leakage = 0.00931467 W
        Runtime Dynamic = 2.25879 W
          Itlb:
            Area = 1.12903 mm^2
            Peak Dynamic = 0.425717 W
            Subthreshold Leakage = 0.0152632 W
            Gate Leakage = 0.00308734 W
            Runtime Dynamic = 0.851444 W
          Dtlb:
            Area = 2.24796 mm^2
            Peak Dynamic = 0.703668 W
            Subthreshold Leakage = 0.0197321 W
            Gate Leakage = 0.00422696 W
            Runtime Dynamic = 1.40735 W
      Execution Unit:
        Area = 18.9802 mm^2
        Peak Dynamic = 6.91426 W
        Subthreshold Leakage = 1.01207 W
        Gate Leakage = 0.130415 W
        Runtime Dynamic = 5.51245 W
          Register Files:
            Area = 4.63431 mm^2
            Peak Dynamic = 1.07973 W
            Subthreshold Leakage = 0.00557121 W
            Gate Leakage = 0.000534421 W
            Runtime Dynamic = 0.491409 W
              Integer RF:
                Area = 3.11444 mm^2
                Peak Dynamic = 0.64479 W
                Subthreshold Leakage = 0.00348926 W
                Gate Leakage = 0.000338898 W
                Runtime Dynamic = 0.43963 W
              Floating Point RF:
                Area = 1.51987 mm^2
                Peak Dynamic = 0.434944 W
                Subthreshold Leakage = 0.00208194 W
                Gate Leakage = 0.000195523 W
                Runtime Dynamic = 0.051779 W
          Instruction Scheduler:
            Area = 2.2958 mm^2
            Peak Dynamic = 0.682653 W
            Subthreshold Leakage = 0.0043779 W
            Gate Leakage = 0.000496354 W
            Runtime Dynamic = 0.783433 W
              Instruction Window:
                Area = 0.416485 mm^2
                Peak Dynamic = 0.230852 W
                Subthreshold Leakage = 0.001531 W
                Gate Leakage = 0.000214549 W
                Runtime Dynamic = 0.308242 W
              FP Instruction Window:
                Area = 0.160067 mm^2
                Peak Dynamic = 0.0899719 W
                Subthreshold Leakage = 0.000573841 W
                Gate Leakage = 9.08104e-05 W
                Runtime Dynamic = 0.113361 W
              ROB:
                Area = 1.71925 mm^2
                Peak Dynamic = 0.361829 W
                Subthreshold Leakage = 0.00227307 W
                Gate Leakage = 0.000190995 W
                Runtime Dynamic = 0.361829 W
          Integer ALUs (Count: 4 ):
            Area = 2.56256 mm^2
            Peak Dynamic = 1.45952 W
            Subthreshold Leakage = 0.514377 W
            Gate Leakage = 0.0657924 W
            Runtime Dynamic = 1.12031 W
          Floating Point Units (FPUs) (Count: 1 ):
            Area = 9.317 mm^2
            Peak Dynamic = 1.32571 W
            Subthreshold Leakage = 0.467545 W
            Gate Leakage = 0.0598023 W
            Runtime Dynamic = 1.32571 W
          Results Broadcast Bus:
            Area Overhead = 0.0521609 mm^2
            Peak Dynamic = 2.15212 W
            Subthreshold Leakage = 0.0139887 W
            Gate Leakage = 0.00178925 W
            Runtime Dynamic = 1.79159 W
 *****************************************************************************************
 L2
      Area = 62.2653 mm^2
      Peak Dynamic = 1.42987 W
      Subthreshold Leakage = 1.65481 W
      Gate Leakage = 0.00860545 W
      Runtime Dynamic = 2.73329 W
 *****************************************************************************************
 Second Level Directory
      Area = 0.533824 mm^2
      Peak Dynamic = 0.275566 W
      Subthreshold Leakage = 0.00929753 W
      Gate Leakage = 0.00179126 W
      Runtime Dynamic = 0.193681 W
 *****************************************************************************************
 Memory Controller:
      Area = 3.54463 mm^2
      Peak Dynamic = 1.65655 W
      Subthreshold Leakage = 0.0225194 W
      Gate Leakage = 0.00285586 W
      Runtime Dynamic = 0.573656 W
      Front End Engine:
        Area = 1.72828 mm^2
        Peak Dynamic = 0.389588 W
        Subthreshold Leakage = 0.00246696 W
        Gate Leakage = 0.000291005 W
        Runtime Dynamic = 0.0911898 W
      Transaction Engine:
        Area = 0.75308 mm^2
        Peak Dynamic = 1.13896 W
        Subthreshold Leakage = 0.00831402 W
        Gate Leakage = 0.00106342 W
        Runtime Dynamic = 0.341678 W
      PHY:
        Area = 1.06326 mm^2
        Peak Dynamic = 0.128 W
        Subthreshold Leakage = 0.0117384 W
        Gate Leakage = 0.00150143 W
        Runtime Dynamic = 0.140788 W
 *****************************************************************************************
 NOC
      Area = 8.77595 mm^2
      Peak Dynamic = 6.17873 W
      Subthreshold Leakage = 0.108357 W
      Gate Leakage = 0.0139259 W
      Runtime Dynamic = 0.963385 W
      Router: 
        Area = 8.3047 mm^2
        Peak Dynamic = 2.78895 W
        Subthreshold Leakage = 0.0606175 W
        Gate Leakage = 0.00781974 W
        Runtime Dynamic = 0.398421 W
            Virtual Channel Buffer:
              Area = 4.2978 mm^2
              Peak Dynamic = 2.31409 W
              Subthreshold Leakage = 0.028002 W
              Gate Leakage = 0.00227471 W
              Runtime Dynamic = 0.330584 W
            Crossbar:
              Area = 0.160538 mm^2
              Peak Dynamic = 0.437862 W
              Subthreshold Leakage = 0.0325996 W
              Gate Leakage = 0.00554292 W
              Runtime Dynamic = 0.0625517 W
            Arbiter:
              Peak Dynamic = 0.0370018 W
              Subthreshold Leakage = 1.5858e-05 W
              Gate Leakage = 2.11117e-06 W
              Runtime Dynamic = 0.00528597 W
      Per Router Links: 
        Area = 0.471256 mm^2
        Peak Dynamic = 3.38978 W
        Subthreshold Leakage = 0.0477391 W
        Gate Leakage = 0.00610616 W
        Runtime Dynamic = 0.564963 W
 *****************************************************************************************
--- a/ext/mcpat/results/Penryn
+++ b/ext/mcpat/results/Penryn
@ -0,0 +1,315 @@
 McPAT (version 0.8 of Aug, 2010) is computing the target processor...
 McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
 *****************************************************************************************
  Technology 45 nm
  Using Long Channel Devices When Appropriate
  Interconnect metal projection= aggressive interconnect technology projection
  Core clock Rate(MHz) 3700
 *****************************************************************************************
 Processor: 
  Area = 92.2661 mm^2
  Peak Power = 61.0228 W
  Total Leakage = 10.8609 W
  Peak Dynamic = 50.1619 W
  Subthreshold Leakage = 10.2773 W
  Gate Leakage = 0.583567 W
  Runtime Dynamic = 69.6347 W
  Total Cores: 2 cores 
  Device Type= ITRS high performance device type
    Area = 48.2438 mm^2
    Peak Dynamic = 39.6676 W
    Subthreshold Leakage = 6.96165 W
    Gate Leakage = 0.541077 W
    Runtime Dynamic = 51.4987 W
  Total L2s: 
  Device Type= ITRS high performance device type
    Area = 43.1009 mm^2
    Peak Dynamic = 6.43272 W
    Subthreshold Leakage = 3.28049 W
    Gate Leakage = 0.0386655 W
    Runtime Dynamic = 13.716 W
  Total NoCs (Network/Bus): 
  Device Type= ITRS high performance device type
    Area = 0.921404 mm^2
    Peak Dynamic = 4.06164 W
    Subthreshold Leakage = 0.035183 W
    Gate Leakage = 0.00382481 W
    Runtime Dynamic = 4.42002 W
 *****************************************************************************************
 Core:
      Area = 24.1219 mm^2
      Peak Dynamic = 19.8338 W
      Subthreshold Leakage = 3.48083 W
      Gate Leakage = 0.270538 W
      Runtime Dynamic = 51.4987 W
      Instruction Fetch Unit:
        Area = 3.13582 mm^2
        Peak Dynamic = 2.49774 W
        Subthreshold Leakage = 0.421089 W
        Gate Leakage = 0.0246791 W
        Runtime Dynamic = 2.42869 W
          Instruction Cache:
            Area = 0.702441 mm^2
            Peak Dynamic = 0.419702 W
            Subthreshold Leakage = 0.0413175 W
            Gate Leakage = 0.00175164 W
            Runtime Dynamic = 0.487111 W
          Branch Target Buffer:
            Area = 0.349484 mm^2
            Peak Dynamic = 0.0903353 W
            Subthreshold Leakage = 0.0243658 W
            Gate Leakage = 0.000966387 W
            Runtime Dynamic = 0.361341 W
          Branch Predictor:
            Area = 0.153017 mm^2
            Peak Dynamic = 0.0718712 W
            Subthreshold Leakage = 0.0142615 W
            Gate Leakage = 0.000619154 W
            Runtime Dynamic = 0.0647272 W
              Global Predictor:
                Area = 0.0475693 mm^2
                Peak Dynamic = 0.0231158 W
                Subthreshold Leakage = 0.00544747 W
                Gate Leakage = 0.000234591 W
                Runtime Dynamic = 0.0245764 W
              Local Predictor:
              L1_Local Predictor:
                Area = 0.0239764 mm^2
                Peak Dynamic = 0.0142817 W
                Subthreshold Leakage = 0.00265926 W
                Gate Leakage = 0.00011608 W
                Runtime Dynamic = 0.0155731 W
              L2_Local Predictor:
                Area = 0.012121 mm^2
                Peak Dynamic = 0.00767395 W
                Subthreshold Leakage = 0.00143248 W
                Gate Leakage = 6.77717e-05 W
                Runtime Dynamic = 0.00837399 W
              Chooser:
                Area = 0.0475693 mm^2
                Peak Dynamic = 0.0231158 W
                Subthreshold Leakage = 0.00544747 W
                Gate Leakage = 0.000234591 W
                Runtime Dynamic = 0.0245764 W
              RAS:
                Area = 0.0217815 mm^2
                Peak Dynamic = 0.0113578 W
                Subthreshold Leakage = 0.000707258 W
                Gate Leakage = 3.38921e-05 W
                Runtime Dynamic = 1.2459e-06 W
          Instruction Buffer:
            Area = 0.0278406 mm^2
            Peak Dynamic = 0.282368 W
            Subthreshold Leakage = 0.000861686 W
            Gate Leakage = 3.91839e-05 W
            Runtime Dynamic = 0.188245 W
          Instruction Decoder:
            Area = 1.85799 mm^2
            Peak Dynamic = 1.32726 W
            Subthreshold Leakage = 0.325606 W
            Gate Leakage = 0.0185411 W
            Runtime Dynamic = 1.32726 W
      Renaming Unit:
        Area = 1.02517 mm^2
        Peak Dynamic = 2.25746 W
        Subthreshold Leakage = 0.042129 W
        Gate Leakage = 0.00480502 W
        Runtime Dynamic = 1.55315 W
          Int Front End RAT:
            Area = 0.59725 mm^2
            Peak Dynamic = 1.25286 W
            Subthreshold Leakage = 0.0159587 W
            Gate Leakage = 0.00122436 W
            Runtime Dynamic = 1.11309 W
          FP Front End RAT:
            Area = 0.350662 mm^2
            Peak Dynamic = 0.652971 W
            Subthreshold Leakage = 0.0110219 W
            Gate Leakage = 0.00079321 W
            Runtime Dynamic = 0.326485 W
          Free List:
            Area = 0.0322035 mm^2
            Peak Dynamic = 0.0454309 W
            Subthreshold Leakage = 0.000471802 W
            Gate Leakage = 2.57995e-05 W
            Runtime Dynamic = 0.113577 W
      Load Store Unit:
        Area = 7.24152 mm^2
        Peak Dynamic = 6.57278 W
        Subthreshold Leakage = 0.310798 W
        Gate Leakage = 0.0358085 W
        Runtime Dynamic = 34.9208 W
          Data Cache:
            Area = 4.65034 mm^2
            Peak Dynamic = 5.03369 W
            Subthreshold Leakage = 0.237004 W
            Gate Leakage = 0.0253255 W
            Runtime Dynamic = 33.601 W
          LoadQ:
            Area = 0.260806 mm^2
            Peak Dynamic = 0.132332 W
            Subthreshold Leakage = 0.00523814 W
            Gate Leakage = 0.000359005 W
            Runtime Dynamic = 0.0661662 W
          StoreQ:
            Area = 1.06006 mm^2
            Peak Dynamic = 1.25365 W
            Subthreshold Leakage = 0.0538794 W
            Gate Leakage = 0.00736236 W
            Runtime Dynamic = 1.25365 W
      Memory Management Unit:
        Area = 0.363299 mm^2
        Peak Dynamic = 0.610831 W
        Subthreshold Leakage = 0.0388017 W
        Gate Leakage = 0.00431691 W
        Runtime Dynamic = 1.29234 W
          Itlb:
            Area = 0.0590462 mm^2
            Peak Dynamic = 0.116192 W
            Subthreshold Leakage = 0.00608044 W
            Gate Leakage = 0.000398475 W
            Runtime Dynamic = 0.232386 W
          Dtlb:
            Area = 0.259199 mm^2
            Peak Dynamic = 0.264986 W
            Subthreshold Leakage = 0.0180446 W
            Gate Leakage = 0.00115678 W
            Runtime Dynamic = 1.05995 W
      Execution Unit:
        Area = 7.9594 mm^2
        Peak Dynamic = 7.89497 W
        Subthreshold Leakage = 1.28761 W
        Gate Leakage = 0.0977152 W
        Runtime Dynamic = 11.3037 W
          Register Files:
            Area = 0.528076 mm^2
            Peak Dynamic = 0.554172 W
            Subthreshold Leakage = 0.00459231 W
            Gate Leakage = 0.000305031 W
            Runtime Dynamic = 0.283985 W
              Integer RF:
                Area = 0.336446 mm^2
                Peak Dynamic = 0.461344 W
                Subthreshold Leakage = 0.00257976 W
                Gate Leakage = 0.00018025 W
                Runtime Dynamic = 0.247149 W
              Floating Point RF:
                Area = 0.19163 mm^2
                Peak Dynamic = 0.0928276 W
                Subthreshold Leakage = 0.00201255 W
                Gate Leakage = 0.000124781 W
                Runtime Dynamic = 0.0368364 W
          Instruction Scheduler:
            Area = 1.97424 mm^2
            Peak Dynamic = 1.76421 W
            Subthreshold Leakage = 0.0212898 W
            Gate Leakage = 0.0014052 W
            Runtime Dynamic = 1.96388 W
              Instruction Window:
                Area = 0.889691 mm^2
                Peak Dynamic = 0.468182 W
                Subthreshold Leakage = 0.0081033 W
                Gate Leakage = 0.000620258 W
                Runtime Dynamic = 0.601258 W
              FP Instruction Window:
                Area = 0.347423 mm^2
                Peak Dynamic = 0.230453 W
                Subthreshold Leakage = 0.00381664 W
                Gate Leakage = 0.000293336 W
                Runtime Dynamic = 0.29704 W
              ROB:
                Area = 0.737129 mm^2
                Peak Dynamic = 1.06558 W
                Subthreshold Leakage = 0.00936988 W
                Gate Leakage = 0.000491606 W
                Runtime Dynamic = 1.06558 W
          Integer ALUs (Count: 6 ):
            Area = 0.47087 mm^2
            Peak Dynamic = 2.2206 W
            Subthreshold Leakage = 0.295671 W
            Gate Leakage = 0.0221076 W
            Runtime Dynamic = 1.14549 W
          Floating Point Units (FPUs) (Count: 2 ):
            Area = 4.6585 mm^2
            Peak Dynamic = 0.708407 W
            Subthreshold Leakage = 0.731296 W
            Gate Leakage = 0.0546797 W
            Runtime Dynamic = 1.28625 W
          Complex ALUs (Mul/Div) (Count: 1 ):
            Area = 0.235435 mm^2
            Peak Dynamic = 0.257249 W
            Subthreshold Leakage = 0.147835 W
            Gate Leakage = 0.0110538 W
            Runtime Dynamic = 1.57424 W
          Results Broadcast Bus:
            Area Overhead = 0.0472187 mm^2
            Peak Dynamic = 2.08413 W
            Subthreshold Leakage = 0.0722513 W
            Gate Leakage = 0.00540229 W
            Runtime Dynamic = 5.04986 W
 *****************************************************************************************
 L2
      Area = 43.1009 mm^2
      Peak Dynamic = 6.43272 W
      Subthreshold Leakage = 3.28049 W
      Gate Leakage = 0.0386655 W
      Runtime Dynamic = 13.716 W
 *****************************************************************************************
 BUSES
      Area = 0.921404 mm^2
      Peak Dynamic = 4.06164 W
      Subthreshold Leakage = 0.035183 W
      Gate Leakage = 0.00382481 W
      Runtime Dynamic = 4.42002 W
      Bus: 
        Area = 0.921404 mm^2
        Peak Dynamic = 4.06164 W
        Subthreshold Leakage = 0.035183 W
        Gate Leakage = 0.00382481 W
        Runtime Dynamic = 4.42002 W
 *****************************************************************************************
--- a/ext/mcpat/results/T1
+++ b/ext/mcpat/results/T1
@ -0,0 +1,296 @@
 McPAT (version 0.8 of Aug, 2010) is computing the target processor...
 McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
 *****************************************************************************************
  Technology 90 nm
  Using Long Channel Devices When Appropriate
  Interconnect metal projection= aggressive interconnect technology projection
  Core clock Rate(MHz) 1200
 *****************************************************************************************
 Processor: 
  Area = 283.287 mm^2
  Peak Power = 55.0318 W
  Total Leakage = 9.78078 W
  Peak Dynamic = 45.2511 W
  Subthreshold Leakage = 8.64906 W
  Gate Leakage = 1.13172 W
  Runtime Dynamic = 45.5013 W
  Total Cores: 
  Device Type= ITRS high performance device type
    Area = 117.887 mm^2
    Peak Dynamic = 28.1307 W
    Subthreshold Leakage = 5.19354 W
    Gate Leakage = 0.730037 W
    Runtime Dynamic = 18.917 W
  Total L2s: 
  Device Type= ITRS high performance device type
    Area = 116.308 mm^2
    Peak Dynamic = 5.51367 W
    Subthreshold Leakage = 2.41316 W
    Gate Leakage = 0.242513 W
    Runtime Dynamic = 4.00707 W
  Total First Level Directory: 
  Device Type= ITRS high performance device type
    Area = 8.77473 mm^2
    Peak Dynamic = 3.38588 W
    Subthreshold Leakage = 0.224524 W
    Gate Leakage = 0.0320801 W
    Runtime Dynamic = 15.1158 W
  Total NoCs (Network/Bus): 
  Device Type= ITRS high performance device type
    Area = 8.87598 mm^2
    Peak Dynamic = 3.67515 W
    Subthreshold Leakage = 0.488892 W
    Gate Leakage = 0.0852308 W
    Runtime Dynamic = 2.20509 W
  Total MCs: 
  Device Type= ITRS high performance device type
    Area = 31.441 mm^2
    Peak Dynamic = 4.5457 W
    Subthreshold Leakage = 0.328953 W
    Gate Leakage = 0.0418558 W
    Runtime Dynamic = 5.25637 W
 *****************************************************************************************
 Core:
      Area = 14.7359 mm^2
      Peak Dynamic = 3.51633 W
      Subthreshold Leakage = 0.649192 W
      Gate Leakage = 0.0912546 W
      Runtime Dynamic = 18.917 W
      Instruction Fetch Unit:
        Area = 3.60967 mm^2
        Peak Dynamic = 0.560912 W
        Subthreshold Leakage = 0.0396492 W
        Gate Leakage = 0.00709504 W
        Runtime Dynamic = 3.76593 W
          Instruction Cache:
            Area = 3.41818 mm^2
            Peak Dynamic = 0.308492 W
            Subthreshold Leakage = 0.0286475 W
            Gate Leakage = 0.00418329 W
            Runtime Dynamic = 0.95332 W
          Instruction Buffer:
            Area = 0.0122742 mm^2
            Peak Dynamic = 0.0121268 W
            Subthreshold Leakage = 0.0002042 W
            Gate Leakage = 1.78658e-05 W
            Runtime Dynamic = 0.0970143 W
          Instruction Decoder:
            Area = 0.0229327 mm^2
            Peak Dynamic = 0.169467 W
            Subthreshold Leakage = 0.00259055 W
            Gate Leakage = 0.000252139 W
            Runtime Dynamic = 1.35574 W
      Load Store Unit:
        Area = 3.07616 mm^2
        Peak Dynamic = 0.390349 W
        Subthreshold Leakage = 0.0362126 W
        Gate Leakage = 0.00713432 W
        Runtime Dynamic = 3.85623 W
          Data Cache:
            Area = 1.47986 mm^2
            Peak Dynamic = 0.191211 W
            Subthreshold Leakage = 0.0157454 W
            Gate Leakage = 0.00208738 W
            Runtime Dynamic = 0.443377 W
          Load/Store Queue:
            Area = 1.17458 mm^2
            Peak Dynamic = 0.128312 W
            Subthreshold Leakage = 0.0122603 W
            Gate Leakage = 0.0024052 W
            Runtime Dynamic = 2.05299 W
      Memory Management Unit:
        Area = 1.27751 mm^2
        Peak Dynamic = 0.324071 W
        Subthreshold Leakage = 0.0192968 W
        Gate Leakage = 0.0049902 W
        Runtime Dynamic = 2.53591 W
          Itlb:
            Area = 0.560615 mm^2
            Peak Dynamic = 0.117604 W
            Subthreshold Leakage = 0.00554488 W
            Gate Leakage = 0.00117423 W
            Runtime Dynamic = 0.940838 W
          Dtlb:
            Area = 0.560615 mm^2
            Peak Dynamic = 0.0294011 W
            Subthreshold Leakage = 0.00554488 W
            Gate Leakage = 0.00117423 W
            Runtime Dynamic = 0.235211 W
      Execution Unit:
        Area = 3.47025 mm^2
        Peak Dynamic = 2.241 W
        Subthreshold Leakage = 0.222601 W
        Gate Leakage = 0.0296426 W
        Runtime Dynamic = 8.75894 W
          Register Files:
            Area = 1.38355 mm^2
            Peak Dynamic = 0.0746572 W
            Subthreshold Leakage = 0.00827136 W
            Gate Leakage = 0.000628178 W
            Runtime Dynamic = 0.320633 W
              Integer RF:
                Area = 0.592652 mm^2
                Peak Dynamic = 0.0582404 W
                Subthreshold Leakage = 0.00161128 W
                Gate Leakage = 0.000148771 W
                Runtime Dynamic = 0.312722 W
              Floating Point RF:
                Area = 0.592652 mm^2
                Peak Dynamic = 0.0164168 W
                Subthreshold Leakage = 0.00161128 W
                Gate Leakage = 0.000148771 W
                Runtime Dynamic = 0.00783962 W
              Register Windows:
                Area = 0.198243 mm^2
                Peak Dynamic = 0 W
                Subthreshold Leakage = 0.00504879 W
                Gate Leakage = 0.000330636 W
                Runtime Dynamic = 7.11291e-05 W
          Instruction Scheduler:
            Area = 0.04377 mm^2
            Peak Dynamic = 0.0284368 W
            Subthreshold Leakage = 0.000336066 W
            Gate Leakage = 5.10703e-05 W
            Runtime Dynamic = 0.244528 W
              Instruction Window:
                Area = 0.04377 mm^2
                Peak Dynamic = 0.0284368 W
                Subthreshold Leakage = 0.000336066 W
                Gate Leakage = 5.10703e-05 W
                Runtime Dynamic = 0.244528 W
          Integer ALUs (Count: 1 ):
            Area = 0.16016 mm^2
            Peak Dynamic = 0.305285 W
            Subthreshold Leakage = 0.0321485 W
            Gate Leakage = 0.00411202 W
            Runtime Dynamic = 2.71365 W
          Floating Point Units (FPUs) (Count: 0.125 ):
            Area = 1.16463 mm^2
            Peak Dynamic = 0.0508808 W
            Subthreshold Leakage = 0.0584431 W
            Gate Leakage = 0.00747528 W
            Runtime Dynamic = 0.101762 W
          Complex ALUs (Mul/Div) (Count: 1 ):
            Area = 0.48048 mm^2
            Peak Dynamic = 0.339206 W
            Subthreshold Leakage = 0.0964456 W
            Gate Leakage = 0.0123361 W
            Runtime Dynamic = 0.678411 W
          Results Broadcast Bus:
            Area Overhead = 0.0813807 mm^2
            Peak Dynamic = 1.18756 W
            Subthreshold Leakage = 0.0187498 W
            Gate Leakage = 0.00239823 W
            Runtime Dynamic = 3.3401 W
 *****************************************************************************************
 L2
      Area = 29.0771 mm^2
      Peak Dynamic = 1.37842 W
      Subthreshold Leakage = 0.603289 W
      Gate Leakage = 0.0606283 W
      Runtime Dynamic = 4.00707 W
 *****************************************************************************************
 First Level Directory
      Area = 2.19368 mm^2
      Peak Dynamic = 0.84647 W
      Subthreshold Leakage = 0.0561311 W
      Gate Leakage = 0.00802003 W
      Runtime Dynamic = 15.1158 W
 *****************************************************************************************
 Memory Controller:
      Area = 7.86025 mm^2
      Peak Dynamic = 1.13642 W
      Subthreshold Leakage = 0.0822383 W
      Gate Leakage = 0.0104639 W
      Runtime Dynamic = 5.25637 W
      Front End Engine:
        Area = 0.63078 mm^2
        Peak Dynamic = 0.0549429 W
        Subthreshold Leakage = 0.00242476 W
        Gate Leakage = 0.00025524 W
        Runtime Dynamic = 0.241753 W
      Transaction Engine:
        Area = 2.59502 mm^2
        Peak Dynamic = 0.569482 W
        Subthreshold Leakage = 0.0286491 W
        Gate Leakage = 0.00366442 W
        Runtime Dynamic = 2.50577 W
      PHY:
        Area = 4.63445 mm^2
        Peak Dynamic = 0.512 W
        Subthreshold Leakage = 0.0511644 W
        Gate Leakage = 0.00654429 W
        Runtime Dynamic = 2.50885 W
 *****************************************************************************************
 NOC
      Area = 8.87598 mm^2
      Peak Dynamic = 3.67515 W
      Subthreshold Leakage = 0.488892 W
      Gate Leakage = 0.0852308 W
      Runtime Dynamic = 2.20509 W
      Router: 
        Area = 4.43799 mm^2
        Peak Dynamic = 1.83757 W
        Subthreshold Leakage = 0.244446 W
        Gate Leakage = 0.0426154 W
        Runtime Dynamic = 2.20509 W
            Virtual Channel Buffer:
              Area = 1.22928 mm^2
              Peak Dynamic = 0.0508654 W
              Subthreshold Leakage = 0.000485491 W
              Gate Leakage = 7.24213e-05 W
              Runtime Dynamic = 0.0610385 W
            Crossbar:
              Area = 1.35717 mm^2
              Peak Dynamic = 1.77185 W
              Subthreshold Leakage = 0.243949 W
              Gate Leakage = 0.0425414 W
              Runtime Dynamic = 2.12622 W
            Arbiter:
              Peak Dynamic = 0.0148566 W
              Subthreshold Leakage = 1.15783e-05 W
              Gate Leakage = 1.54103e-06 W
              Runtime Dynamic = 0.0178279 W
 *****************************************************************************************
--- a/ext/mcpat/results/T1_DC_64
+++ b/ext/mcpat/results/T1_DC_64
@ -0,0 +1,270 @@
 McPAT (version 0.8 of Aug, 2010) is computing the target processor...
 line64
 size1.04858e+06
 line9
 size1.04858e+06
 McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
 *****************************************************************************************
  Technology 22 nm
  Using Long Channel Devices When Appropriate
  Interconnect metal projection= aggressive interconnect technology projection
  Core clock Rate(MHz) 3500
 *****************************************************************************************
 Processor: 
  Area = 322.362 mm^2
  Peak Power = 112.557 W
  Total Leakage = 28.0714 W
  Peak Dynamic = 84.4853 W
  Subthreshold Leakage = 27.7571 W
  Gate Leakage = 0.314289 W
  Runtime Dynamic = 13.4278 W
  Total Cores: 64 cores 
  Device Type= ITRS high performance device type
    Area = 87.1986 mm^2
    Peak Dynamic = 42.426 W
    Subthreshold Leakage = 7.80232 W
    Gate Leakage = 0.0799149 W
    Runtime Dynamic = 9.61388 W
  Total L2s: 
  Device Type= ITRS high performance device type
    Area = 161.532 mm^2
    Peak Dynamic = 21.1059 W
    Subthreshold Leakage = 8.9583 W
    Gate Leakage = 0.100733 W
    Runtime Dynamic = 1.14063 W
  Total First Level Directory: 
  Device Type= ITRS high performance device type
    Area = 22.1741 mm^2
    Peak Dynamic = 0.831407 W
    Subthreshold Leakage = 1.57123 W
    Gate Leakage = 0.0148674 W
    Runtime Dynamic = 0.175856 W
  Total NoCs (Network/Bus): 
  Device Type= ITRS high performance device type
    Area = 51.4571 mm^2
    Peak Dynamic = 20.122 W
    Subthreshold Leakage = 9.42527 W
    Gate Leakage = 0.118774 W
    Runtime Dynamic = 2.49747 W
 *****************************************************************************************
 Core:
      Area = 1.36248 mm^2
      Peak Dynamic = 0.662906 W
      Subthreshold Leakage = 0.121911 W
      Gate Leakage = 0.00124867 W
      Runtime Dynamic = 9.61388 W
      Instruction Fetch Unit:
        Area = 0.140786 mm^2
        Peak Dynamic = 0.0863256 W
        Subthreshold Leakage = 0.00636762 W
        Gate Leakage = 7.4998e-05 W
        Runtime Dynamic = 2.08883 W
          Instruction Cache:
            Area = 0.129377 mm^2
            Peak Dynamic = 0.0476007 W
            Subthreshold Leakage = 0.00381804 W
            Gate Leakage = 2.35266e-05 W
            Runtime Dynamic = 0.0698158 W
          Instruction Buffer:
            Area = 0.000754971 mm^2
            Peak Dynamic = 0.00238165 W
            Subthreshold Leakage = 4.99334e-05 W
            Gate Leakage = 3.27157e-07 W
            Runtime Dynamic = 0.0190532 W
          Instruction Decoder:
            Area = 0.00131543 mm^2
            Peak Dynamic = 0.0246042 W
            Subthreshold Leakage = 0.000538954 W
            Gate Leakage = 3.91915e-06 W
            Runtime Dynamic = 0.196833 W
      Load Store Unit:
        Area = 0.0977414 mm^2
        Peak Dynamic = 0.0587123 W
        Subthreshold Leakage = 0.00580883 W
        Gate Leakage = 7.48788e-05 W
        Runtime Dynamic = 2.07447 W
          Data Cache:
            Area = 0.0569223 mm^2
            Peak Dynamic = 0.0329939 W
            Subthreshold Leakage = 0.00249221 W
            Gate Leakage = 1.63814e-05 W
            Runtime Dynamic = 0.0476753 W
          Load/Store Queue:
            Area = 0.023444 mm^2
            Peak Dynamic = 0.0139792 W
            Subthreshold Leakage = 0.00135593 W
            Gate Leakage = 1.12722e-05 W
            Runtime Dynamic = 0.223667 W
      Memory Management Unit:
        Area = 0.0313997 mm^2
        Peak Dynamic = 0.0446647 W
        Subthreshold Leakage = 0.0029577 W
        Gate Leakage = 5.57335e-05 W
        Runtime Dynamic = 1.92566 W
          Itlb:
            Area = 0.0110306 mm^2
            Peak Dynamic = 0.0122535 W
            Subthreshold Leakage = 0.000498504 W
            Gate Leakage = 4.25417e-06 W
            Runtime Dynamic = 0.0980282 W
          Dtlb:
            Area = 0.0110306 mm^2
            Peak Dynamic = 0.00306337 W
            Subthreshold Leakage = 0.000498504 W
            Gate Leakage = 4.25417e-06 W
            Runtime Dynamic = 0.0245072 W
      Execution Unit:
        Area = 0.299667 mm^2
        Peak Dynamic = 0.473204 W
        Subthreshold Leakage = 0.0379242 W
        Gate Leakage = 0.000384077 W
        Runtime Dynamic = 3.52491 W
          Register Files:
            Area = 0.0598365 mm^2
            Peak Dynamic = 0.0168768 W
            Subthreshold Leakage = 0.0020814 W
            Gate Leakage = 1.24237e-05 W
            Runtime Dynamic = 0.072481 W
              Integer RF:
                Area = 0.0240072 mm^2
                Peak Dynamic = 0.0131657 W
                Subthreshold Leakage = 0.000449165 W
                Gate Leakage = 3.33111e-06 W
                Runtime Dynamic = 0.0706931 W
              Floating Point RF:
                Area = 0.0240072 mm^2
                Peak Dynamic = 0.00371113 W
                Subthreshold Leakage = 0.000449165 W
                Gate Leakage = 3.33111e-06 W
                Runtime Dynamic = 0.0017722 W
              Register Windows:
                Area = 0.0118221 mm^2
                Peak Dynamic = 0 W
                Subthreshold Leakage = 0.00118307 W
                Gate Leakage = 5.76149e-06 W
                Runtime Dynamic = 1.56951e-05 W
          Instruction Scheduler:
            Area = 0.00263062 mm^2
            Peak Dynamic = 0.00540689 W
            Subthreshold Leakage = 8.27524e-05 W
            Gate Leakage = 9.38261e-07 W
            Runtime Dynamic = 0.0464411 W
              Instruction Window:
                Area = 0.00263062 mm^2
                Peak Dynamic = 0.00540689 W
                Subthreshold Leakage = 8.27524e-05 W
                Gate Leakage = 9.38261e-07 W
                Runtime Dynamic = 0.0464411 W
          Integer ALUs (Count: 1 ):
            Area = 0.0384544 mm^2
            Peak Dynamic = 0.0946992 W
            Subthreshold Leakage = 0.00667865 W
            Gate Leakage = 6.39207e-05 W
            Runtime Dynamic = 0.841771 W
          Floating Point Units (FPUs) (Count: 0.125 ):
            Area = 0.0695899 mm^2
            Peak Dynamic = 0.0157832 W
            Subthreshold Leakage = 0.00302155 W
            Gate Leakage = 2.89189e-05 W
            Runtime Dynamic = 0.0315664 W
          Complex ALUs (Mul/Div) (Count: 1 ):
            Area = 0.115363 mm^2
            Peak Dynamic = 0.105221 W
            Subthreshold Leakage = 0.020036 W
            Gate Leakage = 0.000191762 W
            Runtime Dynamic = 0.210443 W
          Results Broadcast Bus:
            Area Overhead = 0.00445381 mm^2
            Peak Dynamic = 0.192955 W
            Subthreshold Leakage = 0.00406321 W
            Gate Leakage = 3.88886e-05 W
            Runtime Dynamic = 0.519078 W
 *****************************************************************************************
 L2
      Area = 2.52394 mm^2
      Peak Dynamic = 0.32978 W
      Subthreshold Leakage = 0.139973 W
      Gate Leakage = 0.00157395 W
      Runtime Dynamic = 1.14063 W
 *****************************************************************************************
 Second Level Directory
      Area = 2.77176 mm^2
      Peak Dynamic = 0.103926 W
      Subthreshold Leakage = 0.196403 W
      Gate Leakage = 0.00185842 W
      Runtime Dynamic = 0.175856 W
 *****************************************************************************************
 NOC
      Area = 51.4571 mm^2
      Peak Dynamic = 20.122 W
      Subthreshold Leakage = 9.42527 W
      Gate Leakage = 0.118774 W
      Runtime Dynamic = 2.49747 W
      Router: 
        Area = 0.578434 mm^2
        Peak Dynamic = 0.184548 W
        Subthreshold Leakage = 0.125515 W
        Gate Leakage = 0.0016409 W
        Runtime Dynamic = 1.32875 W
            Virtual Channel Buffer:
              Area = 0.159162 mm^2
              Peak Dynamic = 0.00394081 W
              Subthreshold Leakage = 0.000194478 W
              Gate Leakage = 1.84946e-06 W
              Runtime Dynamic = 0.0283738 W
            Crossbar:
              Area = 0.160976 mm^2
              Peak Dynamic = 0.179891 W
              Subthreshold Leakage = 0.12532 W
              Gate Leakage = 0.00163905 W
              Runtime Dynamic = 1.29522 W
            Arbiter:
              Peak Dynamic = 0.000716053 W
              Subthreshold Leakage = 3.67148e-07 W
              Gate Leakage = 3.86991e-09 W
              Runtime Dynamic = 0.00515558 W
      Per Router Links: 
        Area = 0.225583 mm^2
        Peak Dynamic = 0.129858 W
        Subthreshold Leakage = 0.0217549 W
        Gate Leakage = 0.000214933 W
        Runtime Dynamic = 1.16872 W
 *****************************************************************************************
--- a/ext/mcpat/results/T1_SBT_64
+++ b/ext/mcpat/results/T1_SBT_64
@ -0,0 +1,252 @@
 McPAT (version 0.8 of Aug, 2010) is computing the target processor...
 line72
 size1.17965e+06
 McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
 *****************************************************************************************
  Technology 22 nm
  Using Long Channel Devices When Appropriate
  Interconnect metal projection= aggressive interconnect technology projection
  Core clock Rate(MHz) 3500
 *****************************************************************************************
 Processor: 
  Area = 321.412 mm^2
  Peak Power = 114.076 W
  Total Leakage = 27.4353 W
  Peak Dynamic = 86.6406 W
  Subthreshold Leakage = 27.1256 W
  Gate Leakage = 0.309772 W
  Runtime Dynamic = 13.4064 W
  Total Cores: 64 cores 
  Device Type= ITRS high performance device type
    Area = 87.1986 mm^2
    Peak Dynamic = 42.426 W
    Subthreshold Leakage = 7.80232 W
    Gate Leakage = 0.0799149 W
    Runtime Dynamic = 9.61388 W
  Total L2s: 
  Device Type= ITRS high performance device type
    Area = 182.778 mm^2
    Peak Dynamic = 24.1051 W
    Subthreshold Leakage = 9.90006 W
    Gate Leakage = 0.111104 W
    Runtime Dynamic = 1.29686 W
  Total NoCs (Network/Bus): 
  Device Type= ITRS high performance device type
    Area = 51.4353 mm^2
    Peak Dynamic = 20.1095 W
    Subthreshold Leakage = 9.42317 W
    Gate Leakage = 0.118753 W
    Runtime Dynamic = 2.4957 W
 *****************************************************************************************
 Core:
      Area = 1.36248 mm^2
      Peak Dynamic = 0.662906 W
      Subthreshold Leakage = 0.121911 W
      Gate Leakage = 0.00124867 W
      Runtime Dynamic = 9.61388 W
      Instruction Fetch Unit:
        Area = 0.140786 mm^2
        Peak Dynamic = 0.0863256 W
        Subthreshold Leakage = 0.00636762 W
        Gate Leakage = 7.4998e-05 W
        Runtime Dynamic = 2.08883 W
          Instruction Cache:
            Area = 0.129377 mm^2
            Peak Dynamic = 0.0476007 W
            Subthreshold Leakage = 0.00381804 W
            Gate Leakage = 2.35266e-05 W
            Runtime Dynamic = 0.0698158 W
          Instruction Buffer:
            Area = 0.000754971 mm^2
            Peak Dynamic = 0.00238165 W
            Subthreshold Leakage = 4.99334e-05 W
            Gate Leakage = 3.27157e-07 W
            Runtime Dynamic = 0.0190532 W
          Instruction Decoder:
            Area = 0.00131543 mm^2
            Peak Dynamic = 0.0246042 W
            Subthreshold Leakage = 0.000538954 W
            Gate Leakage = 3.91915e-06 W
            Runtime Dynamic = 0.196833 W
      Load Store Unit:
        Area = 0.0977414 mm^2
        Peak Dynamic = 0.0587123 W
        Subthreshold Leakage = 0.00580883 W
        Gate Leakage = 7.48788e-05 W
        Runtime Dynamic = 2.07447 W
          Data Cache:
            Area = 0.0569223 mm^2
            Peak Dynamic = 0.0329939 W
            Subthreshold Leakage = 0.00249221 W
            Gate Leakage = 1.63814e-05 W
            Runtime Dynamic = 0.0476753 W
          Load/Store Queue:
            Area = 0.023444 mm^2
            Peak Dynamic = 0.0139792 W
            Subthreshold Leakage = 0.00135593 W
            Gate Leakage = 1.12722e-05 W
            Runtime Dynamic = 0.223667 W
      Memory Management Unit:
        Area = 0.0313997 mm^2
        Peak Dynamic = 0.0446647 W
        Subthreshold Leakage = 0.0029577 W
        Gate Leakage = 5.57335e-05 W
        Runtime Dynamic = 1.92566 W
          Itlb:
            Area = 0.0110306 mm^2
            Peak Dynamic = 0.0122535 W
            Subthreshold Leakage = 0.000498504 W
            Gate Leakage = 4.25417e-06 W
            Runtime Dynamic = 0.0980282 W
          Dtlb:
            Area = 0.0110306 mm^2
            Peak Dynamic = 0.00306337 W
            Subthreshold Leakage = 0.000498504 W
            Gate Leakage = 4.25417e-06 W
            Runtime Dynamic = 0.0245072 W
      Execution Unit:
        Area = 0.299667 mm^2
        Peak Dynamic = 0.473204 W
        Subthreshold Leakage = 0.0379242 W
        Gate Leakage = 0.000384077 W
        Runtime Dynamic = 3.52491 W
          Register Files:
            Area = 0.0598365 mm^2
            Peak Dynamic = 0.0168768 W
            Subthreshold Leakage = 0.0020814 W
            Gate Leakage = 1.24237e-05 W
            Runtime Dynamic = 0.072481 W
              Integer RF:
                Area = 0.0240072 mm^2
                Peak Dynamic = 0.0131657 W
                Subthreshold Leakage = 0.000449165 W
                Gate Leakage = 3.33111e-06 W
                Runtime Dynamic = 0.0706931 W
              Floating Point RF:
                Area = 0.0240072 mm^2
                Peak Dynamic = 0.00371113 W
                Subthreshold Leakage = 0.000449165 W
                Gate Leakage = 3.33111e-06 W
                Runtime Dynamic = 0.0017722 W
              Register Windows:
                Area = 0.0118221 mm^2
                Peak Dynamic = 0 W
                Subthreshold Leakage = 0.00118307 W
                Gate Leakage = 5.76149e-06 W
                Runtime Dynamic = 1.56951e-05 W
          Instruction Scheduler:
            Area = 0.00263062 mm^2
            Peak Dynamic = 0.00540689 W
            Subthreshold Leakage = 8.27524e-05 W
            Gate Leakage = 9.38261e-07 W
            Runtime Dynamic = 0.0464411 W
              Instruction Window:
                Area = 0.00263062 mm^2
                Peak Dynamic = 0.00540689 W
                Subthreshold Leakage = 8.27524e-05 W
                Gate Leakage = 9.38261e-07 W
                Runtime Dynamic = 0.0464411 W
          Integer ALUs (Count: 1 ):
            Area = 0.0384544 mm^2
            Peak Dynamic = 0.0946992 W
            Subthreshold Leakage = 0.00667865 W
            Gate Leakage = 6.39207e-05 W
            Runtime Dynamic = 0.841771 W
          Floating Point Units (FPUs) (Count: 0.125 ):
            Area = 0.0695899 mm^2
            Peak Dynamic = 0.0157832 W
            Subthreshold Leakage = 0.00302155 W
            Gate Leakage = 2.89189e-05 W
            Runtime Dynamic = 0.0315664 W
          Complex ALUs (Mul/Div) (Count: 1 ):
            Area = 0.115363 mm^2
            Peak Dynamic = 0.105221 W
            Subthreshold Leakage = 0.020036 W
            Gate Leakage = 0.000191762 W
            Runtime Dynamic = 0.210443 W
          Results Broadcast Bus:
            Area Overhead = 0.00445381 mm^2
            Peak Dynamic = 0.192955 W
            Subthreshold Leakage = 0.00406321 W
            Gate Leakage = 3.88886e-05 W
            Runtime Dynamic = 0.519078 W
 *****************************************************************************************
 L2
      Area = 2.85591 mm^2
      Peak Dynamic = 0.376642 W
      Subthreshold Leakage = 0.154688 W
      Gate Leakage = 0.001736 W
      Runtime Dynamic = 1.29686 W
 *****************************************************************************************
 NOC
      Area = 51.4353 mm^2
      Peak Dynamic = 20.1095 W
      Subthreshold Leakage = 9.42317 W
      Gate Leakage = 0.118753 W
      Runtime Dynamic = 2.4957 W
      Router: 
        Area = 0.578434 mm^2
        Peak Dynamic = 0.184548 W
        Subthreshold Leakage = 0.125515 W
        Gate Leakage = 0.0016409 W
        Runtime Dynamic = 1.32875 W
            Virtual Channel Buffer:
              Area = 0.159162 mm^2
              Peak Dynamic = 0.00394081 W
              Subthreshold Leakage = 0.000194478 W
              Gate Leakage = 1.84946e-06 W
              Runtime Dynamic = 0.0283738 W
            Crossbar:
              Area = 0.160976 mm^2
              Peak Dynamic = 0.179891 W
              Subthreshold Leakage = 0.12532 W
              Gate Leakage = 0.00163905 W
              Runtime Dynamic = 1.29522 W
            Arbiter:
              Peak Dynamic = 0.000716053 W
              Subthreshold Leakage = 3.67148e-07 W
              Gate Leakage = 3.86991e-09 W
              Runtime Dynamic = 0.00515558 W
      Per Router Links: 
        Area = 0.225243 mm^2
        Peak Dynamic = 0.129662 W
        Subthreshold Leakage = 0.0217221 W
        Gate Leakage = 0.000214609 W
        Runtime Dynamic = 1.16696 W
 *****************************************************************************************
--- a/ext/mcpat/results/T1_ST_64
+++ b/ext/mcpat/results/T1_ST_64
@ -0,0 +1,270 @@
 McPAT (version 0.8 of Aug, 2010) is computing the target processor...
 line64
 size1.04858e+06
 line9
 size8.38861e+06
 McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
 *****************************************************************************************
  Technology 22 nm
  Using Long Channel Devices When Appropriate
  Interconnect metal projection= aggressive interconnect technology projection
  Core clock Rate(MHz) 3500
 *****************************************************************************************
 Processor: 
  Area = 358.016 mm^2
  Peak Power = 168.519 W
  Total Leakage = 30.8855 W
  Peak Dynamic = 137.634 W
  Subthreshold Leakage = 30.5351 W
  Gate Leakage = 0.350385 W
  Runtime Dynamic = 84.2366 W
  Total Cores: 64 cores 
  Device Type= ITRS high performance device type
    Area = 87.1986 mm^2
    Peak Dynamic = 42.426 W
    Subthreshold Leakage = 7.80232 W
    Gate Leakage = 0.0799149 W
    Runtime Dynamic = 9.61388 W
  Total L2s: 
  Device Type= ITRS high performance device type
    Area = 161.532 mm^2
    Peak Dynamic = 21.1059 W
    Subthreshold Leakage = 8.9583 W
    Gate Leakage = 0.100733 W
    Runtime Dynamic = 1.14063 W
  Total First Level Directory: 
  Device Type= ITRS high performance device type
    Area = 57.033 mm^2
    Peak Dynamic = 53.5219 W
    Subthreshold Leakage = 4.27249 W
    Gate Leakage = 0.050206 W
    Runtime Dynamic = 70.9203 W
  Total NoCs (Network/Bus): 
  Device Type= ITRS high performance device type
    Area = 52.2524 mm^2
    Peak Dynamic = 20.5798 W
    Subthreshold Leakage = 9.50197 W
    Gate Leakage = 0.119531 W
    Runtime Dynamic = 2.56185 W
 *****************************************************************************************
 Core:
      Area = 1.36248 mm^2
      Peak Dynamic = 0.662906 W
      Subthreshold Leakage = 0.121911 W
      Gate Leakage = 0.00124867 W
      Runtime Dynamic = 9.61388 W
      Instruction Fetch Unit:
        Area = 0.140786 mm^2
        Peak Dynamic = 0.0863256 W
        Subthreshold Leakage = 0.00636762 W
        Gate Leakage = 7.4998e-05 W
        Runtime Dynamic = 2.08883 W
          Instruction Cache:
            Area = 0.129377 mm^2
            Peak Dynamic = 0.0476007 W
            Subthreshold Leakage = 0.00381804 W
            Gate Leakage = 2.35266e-05 W
            Runtime Dynamic = 0.0698158 W
          Instruction Buffer:
            Area = 0.000754971 mm^2
            Peak Dynamic = 0.00238165 W
            Subthreshold Leakage = 4.99334e-05 W
            Gate Leakage = 3.27157e-07 W
            Runtime Dynamic = 0.0190532 W
          Instruction Decoder:
            Area = 0.00131543 mm^2
            Peak Dynamic = 0.0246042 W
            Subthreshold Leakage = 0.000538954 W
            Gate Leakage = 3.91915e-06 W
            Runtime Dynamic = 0.196833 W
      Load Store Unit:
        Area = 0.0977414 mm^2
        Peak Dynamic = 0.0587123 W
        Subthreshold Leakage = 0.00580883 W
        Gate Leakage = 7.48788e-05 W
        Runtime Dynamic = 2.07447 W
          Data Cache:
            Area = 0.0569223 mm^2
            Peak Dynamic = 0.0329939 W
            Subthreshold Leakage = 0.00249221 W
            Gate Leakage = 1.63814e-05 W
            Runtime Dynamic = 0.0476753 W
          Load/Store Queue:
            Area = 0.023444 mm^2
            Peak Dynamic = 0.0139792 W
            Subthreshold Leakage = 0.00135593 W
            Gate Leakage = 1.12722e-05 W
            Runtime Dynamic = 0.223667 W
      Memory Management Unit:
        Area = 0.0313997 mm^2
        Peak Dynamic = 0.0446647 W
        Subthreshold Leakage = 0.0029577 W
        Gate Leakage = 5.57335e-05 W
        Runtime Dynamic = 1.92566 W
          Itlb:
            Area = 0.0110306 mm^2
            Peak Dynamic = 0.0122535 W
            Subthreshold Leakage = 0.000498504 W
            Gate Leakage = 4.25417e-06 W
            Runtime Dynamic = 0.0980282 W
          Dtlb:
            Area = 0.0110306 mm^2
            Peak Dynamic = 0.00306337 W
            Subthreshold Leakage = 0.000498504 W
            Gate Leakage = 4.25417e-06 W
            Runtime Dynamic = 0.0245072 W
      Execution Unit:
        Area = 0.299667 mm^2
        Peak Dynamic = 0.473204 W
        Subthreshold Leakage = 0.0379242 W
        Gate Leakage = 0.000384077 W
        Runtime Dynamic = 3.52491 W
          Register Files:
            Area = 0.0598365 mm^2
            Peak Dynamic = 0.0168768 W
            Subthreshold Leakage = 0.0020814 W
            Gate Leakage = 1.24237e-05 W
            Runtime Dynamic = 0.072481 W
              Integer RF:
                Area = 0.0240072 mm^2
                Peak Dynamic = 0.0131657 W
                Subthreshold Leakage = 0.000449165 W
                Gate Leakage = 3.33111e-06 W
                Runtime Dynamic = 0.0706931 W
              Floating Point RF:
                Area = 0.0240072 mm^2
                Peak Dynamic = 0.00371113 W
                Subthreshold Leakage = 0.000449165 W
                Gate Leakage = 3.33111e-06 W
                Runtime Dynamic = 0.0017722 W
              Register Windows:
                Area = 0.0118221 mm^2
                Peak Dynamic = 0 W
                Subthreshold Leakage = 0.00118307 W
                Gate Leakage = 5.76149e-06 W
                Runtime Dynamic = 1.56951e-05 W
          Instruction Scheduler:
            Area = 0.00263062 mm^2
            Peak Dynamic = 0.00540689 W
            Subthreshold Leakage = 8.27524e-05 W
            Gate Leakage = 9.38261e-07 W
            Runtime Dynamic = 0.0464411 W
              Instruction Window:
                Area = 0.00263062 mm^2
                Peak Dynamic = 0.00540689 W
                Subthreshold Leakage = 8.27524e-05 W
                Gate Leakage = 9.38261e-07 W
                Runtime Dynamic = 0.0464411 W
          Integer ALUs (Count: 1 ):
            Area = 0.0384544 mm^2
            Peak Dynamic = 0.0946992 W
            Subthreshold Leakage = 0.00667865 W
            Gate Leakage = 6.39207e-05 W
            Runtime Dynamic = 0.841771 W
          Floating Point Units (FPUs) (Count: 0.125 ):
            Area = 0.0695899 mm^2
            Peak Dynamic = 0.0157832 W
            Subthreshold Leakage = 0.00302155 W
            Gate Leakage = 2.89189e-05 W
            Runtime Dynamic = 0.0315664 W
          Complex ALUs (Mul/Div) (Count: 1 ):
            Area = 0.115363 mm^2
            Peak Dynamic = 0.105221 W
            Subthreshold Leakage = 0.020036 W
            Gate Leakage = 0.000191762 W
            Runtime Dynamic = 0.210443 W
          Results Broadcast Bus:
            Area Overhead = 0.00445381 mm^2
            Peak Dynamic = 0.192955 W
            Subthreshold Leakage = 0.00406321 W
            Gate Leakage = 3.88886e-05 W
            Runtime Dynamic = 0.519078 W
 *****************************************************************************************
 L2
      Area = 2.52394 mm^2
      Peak Dynamic = 0.32978 W
      Subthreshold Leakage = 0.139973 W
      Gate Leakage = 0.00157395 W
      Runtime Dynamic = 1.14063 W
 *****************************************************************************************
 Second Level Directory
      Area = 57.033 mm^2
      Peak Dynamic = 53.5219 W
      Subthreshold Leakage = 4.27249 W
      Gate Leakage = 0.050206 W
      Runtime Dynamic = 70.9203 W
 *****************************************************************************************
 NOC
      Area = 52.2524 mm^2
      Peak Dynamic = 20.5798 W
      Subthreshold Leakage = 9.50197 W
      Gate Leakage = 0.119531 W
      Runtime Dynamic = 2.56185 W
      Router: 
        Area = 0.578434 mm^2
        Peak Dynamic = 0.184548 W
        Subthreshold Leakage = 0.125515 W
        Gate Leakage = 0.0016409 W
        Runtime Dynamic = 1.32875 W
            Virtual Channel Buffer:
              Area = 0.159162 mm^2
              Peak Dynamic = 0.00394081 W
              Subthreshold Leakage = 0.000194478 W
              Gate Leakage = 1.84946e-06 W
              Runtime Dynamic = 0.0283738 W
            Crossbar:
              Area = 0.160976 mm^2
              Peak Dynamic = 0.179891 W
              Subthreshold Leakage = 0.12532 W
              Gate Leakage = 0.00163905 W
              Runtime Dynamic = 1.29522 W
            Arbiter:
              Peak Dynamic = 0.000716053 W
              Subthreshold Leakage = 3.67148e-07 W
              Gate Leakage = 3.86991e-09 W
              Runtime Dynamic = 0.00515558 W
      Per Router Links: 
        Area = 0.238009 mm^2
        Peak Dynamic = 0.137011 W
        Subthreshold Leakage = 0.0229533 W
        Gate Leakage = 0.000226773 W
        Runtime Dynamic = 1.2331 W
 *****************************************************************************************
--- a/ext/mcpat/results/T2
+++ b/ext/mcpat/results/T2
@ -0,0 +1,321 @@
 McPAT (version 0.8 of Aug, 2010) is computing the target processor...
 McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
 *****************************************************************************************
  Technology 65 nm
  Using Long Channel Devices When Appropriate
  Interconnect metal projection= aggressive interconnect technology projection
  Core clock Rate(MHz) 1400
 *****************************************************************************************
 Processor: 
  Area = 277.068 mm^2
  Peak Power = 71.8237 W
  Total Leakage = 18.2234 W
  Peak Dynamic = 53.6003 W
  Subthreshold Leakage = 14.7124 W
  Gate Leakage = 3.51096 W
  Runtime Dynamic = 48.652 W
  Total Cores: 8 cores 
  Device Type= ITRS high performance device type
    Area = 116.441 mm^2
    Peak Dynamic = 28.0277 W
    Subthreshold Leakage = 9.00023 W
    Gate Leakage = 1.93139 W
    Runtime Dynamic = 27.9237 W
  Total L2s: 
  Device Type= ITRS high performance device type
    Area = 85.0391 mm^2
    Peak Dynamic = 9.87481 W
    Subthreshold Leakage = 2.71188 W
    Gate Leakage = 0.684324 W
    Runtime Dynamic = 3.97632 W
  Total First Level Directory: 
  Device Type= ITRS high performance device type
    Area = 11.6417 mm^2
    Peak Dynamic = 5.32369 W
    Subthreshold Leakage = 0.249885 W
    Gate Leakage = 0.107486 W
    Runtime Dynamic = 5.38275 W
  Total NoCs (Network/Bus): 
  Device Type= ITRS high performance device type
    Area = 9.56584 mm^2
    Peak Dynamic = 1.07754 W
    Subthreshold Leakage = 1.61961 W
    Gate Leakage = 0.389994 W
    Runtime Dynamic = 1.07754 W
  Total MCs: 4 Memory Controllers 
  Device Type= ITRS high performance device type
    Area = 32.2777 mm^2
    Peak Dynamic = 5.92507 W
    Subthreshold Leakage = 0.559071 W
    Gate Leakage = 0.10416 W
    Runtime Dynamic = 7.93157 W
  Total NIUs: 2 Network Interface Units 
  Device Type= ITRS high performance device type
    Area = 15.8633 mm^2
    Peak Dynamic = 1.86482 W
    Subthreshold Leakage = 0.357626 W
    Gate Leakage = 0.183662 W
    Runtime Dynamic = 1.30537 W
  Total PCIes: 1 PCIe Controllers 
  Device Type= ITRS high performance device type
    Area = 6.24 mm^2
    Peak Dynamic = 1.5067 W
    Subthreshold Leakage = 0.214091 W
    Gate Leakage = 0.109948 W
    Runtime Dynamic = 1.05469 W
 *****************************************************************************************
 Core:
      Area = 14.5551 mm^2
      Peak Dynamic = 3.50346 W
      Subthreshold Leakage = 1.12503 W
      Gate Leakage = 0.241423 W
      Runtime Dynamic = 27.9237 W
      Instruction Fetch Unit:
        Area = 2.75911 mm^2
        Peak Dynamic = 0.817936 W
        Subthreshold Leakage = 0.0912466 W
        Gate Leakage = 0.0284483 W
        Runtime Dynamic = 4.81754 W
          Instruction Cache:
            Area = 2.51671 mm^2
            Peak Dynamic = 0.513783 W
            Subthreshold Leakage = 0.062355 W
            Gate Leakage = 0.0164185 W
            Runtime Dynamic = 1.59033 W
          Instruction Buffer:
            Area = 0.0130935 mm^2
            Peak Dynamic = 0.0100268 W
            Subthreshold Leakage = 0.000434992 W
            Gate Leakage = 6.02581e-05 W
            Runtime Dynamic = 0.160429 W
          Instruction Decoder:
            Area = 0.0119193 mm^2
            Peak Dynamic = 0.0892213 W
            Subthreshold Leakage = 0.00298091 W
            Gate Leakage = 0.000408973 W
            Runtime Dynamic = 1.42754 W
      Load Store Unit:
        Area = 2.14252 mm^2
        Peak Dynamic = 0.487978 W
        Subthreshold Leakage = 0.0802768 W
        Gate Leakage = 0.0247378 W
        Runtime Dynamic = 10.9331 W
          Data Cache:
            Area = 0.52868 mm^2
            Peak Dynamic = 0.0991646 W
            Subthreshold Leakage = 0.0119043 W
            Gate Leakage = 0.00145618 W
            Runtime Dynamic = 0.1303 W
          Load/Store Queue:
            Area = 1.22144 mm^2
            Peak Dynamic = 0.286361 W
            Subthreshold Leakage = 0.0428969 W
            Gate Leakage = 0.011721 W
            Runtime Dynamic = 9.16355 W
      Memory Management Unit:
        Area = 1.1006 mm^2
        Peak Dynamic = 0.399121 W
        Subthreshold Leakage = 0.0527367 W
        Gate Leakage = 0.0195353 W
        Runtime Dynamic = 2.78316 W
          Itlb:
            Area = 0.293144 mm^2
            Peak Dynamic = 0.0743045 W
            Subthreshold Leakage = 0.00720086 W
            Gate Leakage = 0.00218791 W
            Runtime Dynamic = 0.594438 W
          Dtlb:
            Area = 0.590071 mm^2
            Peak Dynamic = 0.0686851 W
            Subthreshold Leakage = 0.0200602 W
            Gate Leakage = 0.00578676 W
            Runtime Dynamic = 0.549486 W
      Execution Unit:
        Area = 6.79584 mm^2
        Peak Dynamic = 1.79843 W
        Subthreshold Leakage = 0.610924 W
        Gate Leakage = 0.116437 W
        Runtime Dynamic = 9.38994 W
          Register Files:
            Area = 1.18037 mm^2
            Peak Dynamic = 0.0639548 W
            Subthreshold Leakage = 0.00981018 W
            Gate Leakage = 0.00106415 W
            Runtime Dynamic = 0.401933 W
              Integer RF:
                Area = 0.648931 mm^2
                Peak Dynamic = 0.0485174 W
                Subthreshold Leakage = 0.00196627 W
                Gate Leakage = 0.000259389 W
                Runtime Dynamic = 0.392074 W
              Floating Point RF:
                Area = 0.324465 mm^2
                Peak Dynamic = 0.0154374 W
                Subthreshold Leakage = 0.00196627 W
                Gate Leakage = 0.000259389 W
                Runtime Dynamic = 0.0098154 W
              Register Windows:
                Area = 0.206972 mm^2
                Peak Dynamic = 0 W
                Subthreshold Leakage = 0.00587765 W
                Gate Leakage = 0.000545372 W
                Runtime Dynamic = 4.40062e-05 W
          Instruction Scheduler:
            Area = 0.0458096 mm^2
            Peak Dynamic = 0.0333897 W
            Subthreshold Leakage = 0.000402487 W
            Gate Leakage = 8.61395e-05 W
            Runtime Dynamic = 0.287483 W
              Instruction Window:
                Area = 0.0458096 mm^2
                Peak Dynamic = 0.0333897 W
                Subthreshold Leakage = 0.000402487 W
                Gate Leakage = 8.61395e-05 W
                Runtime Dynamic = 0.287483 W
          Integer ALUs (Count: 2 ):
            Area = 0.448448 mm^2
            Peak Dynamic = 0.425547 W
            Subthreshold Leakage = 0.147955 W
            Gate Leakage = 0.0266792 W
            Runtime Dynamic = 3.78264 W
          Floating Point Units (FPUs) (Count: 1 ):
            Area = 4.85979 mm^2
            Peak Dynamic = 0.425547 W
            Subthreshold Leakage = 0.400843 W
            Gate Leakage = 0.07228 W
            Runtime Dynamic = 0.0709246 W
          Results Broadcast Bus:
            Area Overhead = 0.0440413 mm^2
            Peak Dynamic = 0.481158 W
            Subthreshold Leakage = 0.0264373 W
            Gate Leakage = 0.00476717 W
            Runtime Dynamic = 3.20772 W
 *****************************************************************************************
 L2
      Area = 10.6299 mm^2
      Peak Dynamic = 1.23435 W
      Subthreshold Leakage = 0.338985 W
      Gate Leakage = 0.0855405 W
      Runtime Dynamic = 3.97632 W
 *****************************************************************************************
 First Level Directory
      Area = 1.45521 mm^2
      Peak Dynamic = 0.665462 W
      Subthreshold Leakage = 0.0312356 W
      Gate Leakage = 0.0134358 W
      Runtime Dynamic = 5.38275 W
 *****************************************************************************************
 Memory Controller:
      Area = 8.06942 mm^2
      Peak Dynamic = 1.48127 W
      Subthreshold Leakage = 0.139768 W
      Gate Leakage = 0.0260401 W
      Runtime Dynamic = 7.93157 W
      Front End Engine:
        Area = 0.250458 mm^2
        Peak Dynamic = 0.05883 W
        Subthreshold Leakage = 0.0029079 W
        Gate Leakage = 0.000455875 W
        Runtime Dynamic = 0.298069 W
      Transaction Engine:
        Area = 2.66058 mm^2
        Peak Dynamic = 0.6912 W
        Subthreshold Leakage = 0.0465697 W
        Gate Leakage = 0.00870562 W
        Runtime Dynamic = 3.50205 W
      PHY:
        Area = 5.15838 mm^2
        Peak Dynamic = 0.731237 W
        Subthreshold Leakage = 0.0902901 W
        Gate Leakage = 0.0168786 W
        Runtime Dynamic = 4.13145 W
 *****************************************************************************************
 NIU:
      Area = 7.93167 mm^2
      Peak Dynamic = 0.93241 W
      Subthreshold Leakage = 0.178813 W
      Gate Leakage = 0.0918312 W
      Runtime Dynamic = 0.652687 W
 *****************************************************************************************
 PCIe:
      Area = 6.24 mm^2
      Peak Dynamic = 1.5067 W
      Subthreshold Leakage = 0.214091 W
      Gate Leakage = 0.109948 W
      Runtime Dynamic = 1.05469 W
 *****************************************************************************************
 NOC
      Area = 9.56584 mm^2
      Peak Dynamic = 1.07754 W
      Subthreshold Leakage = 1.61961 W
      Gate Leakage = 0.389994 W
      Runtime Dynamic = 1.07754 W
      Router: 
        Area = 4.78292 mm^2
        Peak Dynamic = 0.538772 W
        Subthreshold Leakage = 0.809805 W
        Gate Leakage = 0.194997 W
        Runtime Dynamic = 1.07754 W
            Virtual Channel Buffer:
              Area = 0.827721 mm^2
              Peak Dynamic = 0.0223838 W
              Subthreshold Leakage = 0.00314985 W
              Gate Leakage = 0.000413272 W
              Runtime Dynamic = 0.0447677 W
            Crossbar:
              Area = 1.69589 mm^2
              Peak Dynamic = 0.511174 W
              Subthreshold Leakage = 0.806641 W
              Gate Leakage = 0.194581 W
              Runtime Dynamic = 1.02235 W
            Arbiter:
              Peak Dynamic = 0.00521447 W
              Subthreshold Leakage = 1.42757e-05 W
              Gate Leakage = 2.78294e-06 W
              Runtime Dynamic = 0.0104289 W
 *****************************************************************************************
--- a/ext/mcpat/results/Xeon_core
+++ b/ext/mcpat/results/Xeon_core
@ -0,0 +1,341 @@
 McPAT (version 0.7 of May, 2010) is computing the target processor...
 McPAT (version 0.7 of May, 2010) results  (current print level is 5)
 *****************************************************************************************
  Technology 65 nm
  Using Long Channel Devices When Appropriate
  Interconnect metal projection= aggressive interconnect technology projection
  Core clock Rate(MHz) 3400
 *****************************************************************************************
 Processor: 
  Area = 417.445 mm^2
  Peak Power = 142.148 W
  Total Leakage = 55.8021 W
  Peak Dynamic = 86.3458 W
  Subthreshold Leakage = 52.785 W
  Gate Leakage = 3.01712 W
  Runtime Dynamic = 63.1851 W
  Total Cores: 
  Device Type= ITRS high performance device type
    Area = 133.278 mm^2
    Peak Dynamic = 63.8414 W
    Subthreshold Leakage = 32.4393 W
    Gate Leakage = 2.72517 W
    Runtime Dynamic = 41.616 W
  Total L3s: 
  Device Type= ITRS high performance device type
    Area = 278.612 mm^2
    Peak Dynamic = 6.11346 W
    Subthreshold Leakage = 20.1995 W
    Gate Leakage = 0.267752 W
    Runtime Dynamic = 5.1782 W
  Total NoCs (Network/Bus): 
  Device Type= ITRS high performance device type
    Area = 5.5548 mm^2
    Peak Dynamic = 16.3909 W
    Subthreshold Leakage = 0.146229 W
    Gate Leakage = 0.0241913 W
    Runtime Dynamic = 16.3909 W
 *****************************************************************************************
 Core:
      Area = 66.6389 mm^2
      Peak Dynamic = 31.9207 W
      Subthreshold Leakage = 16.2197 W
      Gate Leakage = 1.36259 W
      Runtime Dynamic = 41.616 W
      Instruction Fetch Unit:
        Area = 7.41271 mm^2
        Peak Dynamic = 5.04492 W
        Subthreshold Leakage = 1.26751 W
        Gate Leakage = 0.09429 W
        Runtime Dynamic = 5.39803 W
          Instruction Cache:
            Area = 2.44324 mm^2
            Peak Dynamic = 1.42048 W
            Subthreshold Leakage = 0.359444 W
            Gate Leakage = 0.0187045 W
            Runtime Dynamic = 2.13804 W
          Branch Target Buffer:
            Area = 0.729086 mm^2
            Peak Dynamic = 0.161698 W
            Subthreshold Leakage = 0.0616324 W
            Gate Leakage = 0.00336254 W
            Runtime Dynamic = 0.646794 W
          Branch Predictor:
            Area = 0.430961 mm^2
            Peak Dynamic = 0.188469 W
            Subthreshold Leakage = 0.0698834 W
            Gate Leakage = 0.00415943 W
            Runtime Dynamic = 0.166045 W
              Global Predictor:
                Area = 0.174771 mm^2
                Peak Dynamic = 0.0633335 W
                Subthreshold Leakage = 0.0274086 W
                Gate Leakage = 0.00158249 W
                Runtime Dynamic = 0.0633335 W
              Local Predictor:
                Area = 0.0735854 mm^2
                Peak Dynamic = 0.0393754 W
                Subthreshold Leakage = 0.0111166 W
                Gate Leakage = 0.000721196 W
                Runtime Dynamic = 0.0393754 W
                Area = 0.0507308 mm^2
                Peak Dynamic = 0.0258383 W
                Subthreshold Leakage = 0.00749994 W
                Gate Leakage = 0.000498805 W
                Runtime Dynamic = 0.0258383 W
              Chooser:
                Area = 0.174771 mm^2
                Peak Dynamic = 0.0633335 W
                Subthreshold Leakage = 0.0274086 W
                Gate Leakage = 0.00158249 W
                Runtime Dynamic = 0.0633335 W
              RAS:
                Area = 0.0613744 mm^2
                Peak Dynamic = 0.0224266 W
                Subthreshold Leakage = 0.00394955 W
                Gate Leakage = 0.000273252 W
                Runtime Dynamic = 2.51602e-06 W
          Instruction Buffer:
            Area = 0.0684348 mm^2
            Peak Dynamic = 0.704461 W
            Subthreshold Leakage = 0.00411741 W
            Gate Leakage = 0.000240288 W
            Runtime Dynamic = 0.46964 W
          Instruction Decoder:
            Area = 3.73007 mm^2
            Peak Dynamic = 1.97751 W
            Subthreshold Leakage = 0.733056 W
            Gate Leakage = 0.0575912 W
            Runtime Dynamic = 1.97751 W
      Renaming Unit:
        Area = 1.82421 mm^2
        Peak Dynamic = 2.76284 W
        Subthreshold Leakage = 0.0765654 W
        Gate Leakage = 0.0125478 W
        Runtime Dynamic = 1.94438 W
          Int Front End RAT:
            Area = 0.875874 mm^2
            Peak Dynamic = 1.249 W
            Subthreshold Leakage = 0.0113878 W
            Gate Leakage = 0.000693471 W
            Runtime Dynamic = 1.249 W
          FP Front End RAT:
            Area = 0.405459 mm^2
            Peak Dynamic = 0.610062 W
            Subthreshold Leakage = 0.0144803 W
            Gate Leakage = 0.000906674 W
            Runtime Dynamic = 0.305031 W
          Free List:
            Area = 0.297629 mm^2
            Peak Dynamic = 0.137664 W
            Subthreshold Leakage = 0.0054316 W
            Gate Leakage = 0.000326171 W
            Runtime Dynamic = 0.275328 W
          Int Retire RAT: 
            Area = 0.0530903 mm^2
            Peak Dynamic = 0.056222 W
            Subthreshold Leakage = 0.00135314 W
            Gate Leakage = 0.00011607 W
            Runtime Dynamic = 0.056222 W
          FP Retire RAT:
            Area = 0.018828 mm^2
            Peak Dynamic = 0.0186388 W
            Subthreshold Leakage = 0.000788229 W
            Gate Leakage = 6.41952e-05 W
            Runtime Dynamic = 0.00931941 W
          FP Free List:
            Area = 0.162422 mm^2
            Peak Dynamic = 0.0989385 W
            Subthreshold Leakage = 0.00375181 W
            Gate Leakage = 0.000209083 W
            Runtime Dynamic = 0.0494693 W
      Load Store Unit:
        Area = 4.35998 mm^2
        Peak Dynamic = 2.94939 W
        Subthreshold Leakage = 0.208781 W
        Gate Leakage = 0.0232213 W
        Runtime Dynamic = 3.60184 W
          Data Cache:
            Area = 2.2051 mm^2
            Peak Dynamic = 1.08067 W
            Subthreshold Leakage = 0.0877157 W
            Gate Leakage = 0.00573003 W
            Runtime Dynamic = 2.30478 W
          LoadQ:
            Area = 0.637121 mm^2
            Peak Dynamic = 0.551016 W
            Subthreshold Leakage = 0.0283256 W
            Gate Leakage = 0.00254841 W
            Runtime Dynamic = 0.275508 W
          StoreQ:
            Area = 0.809965 mm^2
            Peak Dynamic = 1.02155 W
            Subthreshold Leakage = 0.053367 W
            Gate Leakage = 0.00471074 W
            Runtime Dynamic = 1.02155 W
      Memory Management Unit:
        Area = 0.517456 mm^2
        Peak Dynamic = 0.979218 W
        Subthreshold Leakage = 0.0808171 W
        Gate Leakage = 0.0139952 W
        Runtime Dynamic = 1.66678 W
          Itlb:
            Area = 0.127123 mm^2
            Peak Dynamic = 0.236587 W
            Subthreshold Leakage = 0.0160962 W
            Gate Leakage = 0.00146431 W
            Runtime Dynamic = 0.473177 W
          Dtlb:
            Area = 0.379422 mm^2
            Peak Dynamic = 0.298399 W
            Subthreshold Leakage = 0.0253484 W
            Gate Leakage = 0.00229878 W
            Runtime Dynamic = 1.1936 W
      Execution Unit:
        Area = 27.5381 mm^2
        Peak Dynamic = 16.9637 W
        Subthreshold Leakage = 7.08185 W
        Gate Leakage = 0.73316 W
        Runtime Dynamic = 22.7198 W
          Register Files:
            Area = 11.2548 mm^2
            Peak Dynamic = 3.2925 W
            Subthreshold Leakage = 0.11111 W
            Gate Leakage = 0.00754256 W
            Runtime Dynamic = 1.69823 W
              Integer RF:
                Area = 7.55916 mm^2
                Peak Dynamic = 2.82012 W
                Subthreshold Leakage = 0.0664048 W
                Gate Leakage = 0.00458288 W
                Runtime Dynamic = 1.51078 W
              Floating Point RF:
                Area = 3.69565 mm^2
                Peak Dynamic = 0.472385 W
                Subthreshold Leakage = 0.0447053 W
                Gate Leakage = 0.00295968 W
                Runtime Dynamic = 0.187454 W
          Instruction Scheduler:
            Area = 2.08681 mm^2
            Peak Dynamic = 2.1684 W
            Subthreshold Leakage = 0.0325294 W
            Gate Leakage = 0.00296372 W
            Runtime Dynamic = 2.59089 W
              Instruction Window:
                Area = 0.287309 mm^2
                Peak Dynamic = 0.929972 W
                Subthreshold Leakage = 0.0127376 W
                Gate Leakage = 0.00137073 W
                Runtime Dynamic = 1.2089 W
              FP Instruction Window:
                Area = 0.128977 mm^2
                Peak Dynamic = 0.478661 W
                Subthreshold Leakage = 0.00802287 W
                Gate Leakage = 0.000873414 W
                Runtime Dynamic = 0.622222 W
              ROB:
                Area = 1.67052 mm^2
                Peak Dynamic = 0.759764 W
                Subthreshold Leakage = 0.0117689 W
                Gate Leakage = 0.000719579 W
                Runtime Dynamic = 0.759764 W
          Integer ALUs (Count: 6 ):
            Area = 4.03603 mm^2
            Peak Dynamic = 4.55818 W
            Subthreshold Leakage = 3.9898 W
            Gate Leakage = 0.412015 W
            Runtime Dynamic = 2.33394 W
          Floating Point Units (FPUs) (Count: 2 ):
            Area = 9.71959 mm^2
            Peak Dynamic = 1.43327 W
            Subthreshold Leakage = 2.40207 W
            Gate Leakage = 0.248054 W
            Runtime Dynamic = 2.55333 W
          Complex ALUs (Mul/Div) (Count: 1 ):
            Area = 0.336336 mm^2
            Peak Dynamic = 0.510666 W
            Subthreshold Leakage = 0.332484 W
            Gate Leakage = 0.0343346 W
            Runtime Dynamic = 3.18505 W
          Results Broadcast Bus:
            Area Overhead = 0.0936618 mm^2
            Peak Dynamic = 4.4084 W
            Subthreshold Leakage = 0.174486 W
            Gate Leakage = 0.0180186 W
            Runtime Dynamic = 10.3584 W
    L2
    Area = 15.914 mm^2
    Peak Dynamic = 3.22061 W
    Subthreshold Leakage = 3.01991 W
    Gate Leakage = 0.0223008 W
    Runtime Dynamic = 6.28514 W
 *****************************************************************************************
      L3
      Area = 278.612 mm^2
      Peak Dynamic = 6.11346 W
      Subthreshold Leakage = 20.1995 W
      Gate Leakage = 0.267752 W
      Runtime Dynamic = 5.1782 W
 *****************************************************************************************
 BUSES
      Area = 5.5548 mm^2
      Peak Dynamic = 16.3909 W
      Subthreshold Leakage = 0.146229 W
      Gate Leakage = 0.0241913 W
      Runtime Dynamic = 16.3909 W
      Bus: 
        Area = 5.5548 mm^2
        Peak Dynamic = 16.3909 W
        Subthreshold Leakage = 0.146229 W
        Gate Leakage = 0.0241913 W
        Runtime Dynamic = 16.3909 W
 *****************************************************************************************
--- a/ext/mcpat/results/Xeon_uncore
+++ b/ext/mcpat/results/Xeon_uncore
@ -0,0 +1,341 @@
 McPAT (version 0.7 of May, 2010) is computing the target processor...
 McPAT (version 0.7 of May, 2010) results  (current print level is 5)
 *****************************************************************************************
  Technology 65 nm
  Using Long Channel Devices When Appropriate
  Interconnect metal projection= aggressive interconnect technology projection
  Core clock Rate(MHz) 3400
 *****************************************************************************************
 Processor: 
  Area = 418.629 mm^2
  Peak Power = 96.2032 W
  Total Leakage = 27.5568 W
  Peak Dynamic = 68.6463 W
  Subthreshold Leakage = 25.8287 W
  Gate Leakage = 1.72809 W
  Runtime Dynamic = 50.332 W
  Total Cores: 
  Device Type= ITRS high performance device type
    Area = 134.217 mm^2
    Peak Dynamic = 50.8677 W
    Subthreshold Leakage = 15.0187 W
    Gate Leakage = 1.57092 W
    Runtime Dynamic = 33.3003 W
  Total L3s: 
  Device Type= ITRS high performance device type
    Area = 278.843 mm^2
    Peak Dynamic = 4.84476 W
    Subthreshold Leakage = 10.7416 W
    Gate Leakage = 0.144361 W
    Runtime Dynamic = 4.09781 W
  Total NoCs (Network/Bus): 
  Device Type= ITRS high performance device type
    Area = 5.56828 mm^2
    Peak Dynamic = 12.9339 W
    Subthreshold Leakage = 0.0684953 W
    Gate Leakage = 0.0128043 W
    Runtime Dynamic = 12.9339 W
 *****************************************************************************************
 Core:
      Area = 67.1085 mm^2
      Peak Dynamic = 25.4338 W
      Subthreshold Leakage = 7.50933 W
      Gate Leakage = 0.78546 W
      Runtime Dynamic = 33.3003 W
      Instruction Fetch Unit:
        Area = 7.56843 mm^2
        Peak Dynamic = 4.27305 W
        Subthreshold Leakage = 0.571346 W
        Gate Leakage = 0.0523885 W
        Runtime Dynamic = 4.67953 W
          Instruction Cache:
            Area = 2.44678 mm^2
            Peak Dynamic = 1.1785 W
            Subthreshold Leakage = 0.151766 W
            Gate Leakage = 0.009764 W
            Runtime Dynamic = 1.7926 W
          Branch Target Buffer:
            Area = 0.718635 mm^2
            Peak Dynamic = 0.151619 W
            Subthreshold Leakage = 0.0238082 W
            Gate Leakage = 0.0015503 W
            Runtime Dynamic = 0.606475 W
          Branch Predictor:
            Area = 0.446844 mm^2
            Peak Dynamic = 0.158508 W
            Subthreshold Leakage = 0.0293041 W
            Gate Leakage = 0.0021362 W
            Runtime Dynamic = 0.14087 W
              Global Predictor:
                Area = 0.174801 mm^2
                Peak Dynamic = 0.0543932 W
                Subthreshold Leakage = 0.0116121 W
                Gate Leakage = 0.000827171 W
                Runtime Dynamic = 0.0543932 W
              Local Predictor:
                Area = 0.0788692 mm^2
                Peak Dynamic = 0.0320817 W
                Subthreshold Leakage = 0.00452837 W
                Gate Leakage = 0.000354718 W
                Runtime Dynamic = 0.0320817 W
                Area = 0.050748 mm^2
                Peak Dynamic = 0.0218669 W
                Subthreshold Leakage = 0.00318852 W
                Gate Leakage = 0.000264126 W
                Runtime Dynamic = 0.0218669 W
              Chooser:
                Area = 0.174801 mm^2
                Peak Dynamic = 0.0543932 W
                Subthreshold Leakage = 0.0116121 W
                Gate Leakage = 0.000827171 W
                Runtime Dynamic = 0.0543932 W
              RAS:
                Area = 0.0929863 mm^2
                Peak Dynamic = 0.0176394 W
                Subthreshold Leakage = 0.00155163 W
                Gate Leakage = 0.00012714 W
                Runtime Dynamic = 1.96119e-06 W
          Instruction Buffer:
            Area = 0.0687233 mm^2
            Peak Dynamic = 0.579633 W
            Subthreshold Leakage = 0.00177049 W
            Gate Leakage = 0.000129185 W
            Runtime Dynamic = 0.386422 W
          Instruction Decoder:
            Area = 3.87654 mm^2
            Peak Dynamic = 1.75316 W
            Subthreshold Leakage = 0.348225 W
            Gate Leakage = 0.0335628 W
            Runtime Dynamic = 1.75316 W
      Renaming Unit:
        Area = 1.83366 mm^2
        Peak Dynamic = 2.16025 W
        Subthreshold Leakage = 0.0324638 W
        Gate Leakage = 0.00648876 W
        Runtime Dynamic = 1.53428 W
          Int Front End RAT:
            Area = 0.879521 mm^2
            Peak Dynamic = 0.975897 W
            Subthreshold Leakage = 0.00490782 W
            Gate Leakage = 0.000372282 W
            Runtime Dynamic = 0.975897 W
          FP Front End RAT:
            Area = 0.407642 mm^2
            Peak Dynamic = 0.477469 W
            Subthreshold Leakage = 0.00619591 W
            Gate Leakage = 0.000483134 W
            Runtime Dynamic = 0.238735 W
          Free List:
            Area = 0.300513 mm^2
            Peak Dynamic = 0.112906 W
            Subthreshold Leakage = 0.00233243 W
            Gate Leakage = 0.000174984 W
            Runtime Dynamic = 0.225813 W
          Int Retire RAT: 
            Area = 0.0534147 mm^2
            Peak Dynamic = 0.0453154 W
            Subthreshold Leakage = 0.00058142 W
            Gate Leakage = 6.26682e-05 W
            Runtime Dynamic = 0.0453154 W
          FP Retire RAT:
            Area = 0.018897 mm^2
            Peak Dynamic = 0.0151716 W
            Subthreshold Leakage = 0.000337803 W
            Gate Leakage = 3.45545e-05 W
            Runtime Dynamic = 0.00758578 W
          FP Free List:
            Area = 0.162758 mm^2
            Peak Dynamic = 0.081858 W
            Subthreshold Leakage = 0.00163685 W
            Gate Leakage = 0.000115075 W
            Runtime Dynamic = 0.040929 W
      Load Store Unit:
        Area = 4.4281 mm^2
        Peak Dynamic = 2.34722 W
        Subthreshold Leakage = 0.0896936 W
        Gate Leakage = 0.0121845 W
        Runtime Dynamic = 2.89901 W
          Data Cache:
            Area = 2.25853 mm^2
            Peak Dynamic = 0.888323 W
            Subthreshold Leakage = 0.0382167 W
            Gate Leakage = 0.00311455 W
            Runtime Dynamic = 1.88387 W
          LoadQ:
            Area = 0.638298 mm^2
            Peak Dynamic = 0.435889 W
            Subthreshold Leakage = 0.0121526 W
            Gate Leakage = 0.00134375 W
            Runtime Dynamic = 0.217944 W
          StoreQ:
            Area = 0.811765 mm^2
            Peak Dynamic = 0.79719 W
            Subthreshold Leakage = 0.0228527 W
            Gate Leakage = 0.00248017 W
            Runtime Dynamic = 0.79719 W
      Memory Management Unit:
        Area = 0.518866 mm^2
        Peak Dynamic = 0.760463 W
        Subthreshold Leakage = 0.0342246 W
        Gate Leakage = 0.00722713 W
        Runtime Dynamic = 1.31193 W
          Itlb:
            Area = 0.12744 mm^2
            Peak Dynamic = 0.187517 W
            Subthreshold Leakage = 0.00686539 W
            Gate Leakage = 0.000767441 W
            Runtime Dynamic = 0.375037 W
          Dtlb:
            Area = 0.380515 mm^2
            Peak Dynamic = 0.234221 W
            Subthreshold Leakage = 0.0108877 W
            Gate Leakage = 0.00121362 W
            Runtime Dynamic = 0.936886 W
      Execution Unit:
        Area = 27.5564 mm^2
        Peak Dynamic = 13.34 W
        Subthreshold Leakage = 3.35055 W
        Gate Leakage = 0.425 W
        Runtime Dynamic = 17.8618 W
          Register Files:
            Area = 11.2668 mm^2
            Peak Dynamic = 2.65925 W
            Subthreshold Leakage = 0.0472795 W
            Gate Leakage = 0.00398463 W
            Runtime Dynamic = 1.37147 W
              Integer RF:
                Area = 7.56635 mm^2
                Peak Dynamic = 2.27672 W
                Subthreshold Leakage = 0.0282472 W
                Gate Leakage = 0.00241709 W
                Runtime Dynamic = 1.21967 W
              Floating Point RF:
                Area = 3.70048 mm^2
                Peak Dynamic = 0.382527 W
                Subthreshold Leakage = 0.0190323 W
                Gate Leakage = 0.00156754 W
                Runtime Dynamic = 0.151797 W
          Instruction Scheduler:
            Area = 2.09118 mm^2
            Peak Dynamic = 1.7092 W
            Subthreshold Leakage = 0.0139125 W
            Gate Leakage = 0.00156067 W
            Runtime Dynamic = 2.04197 W
              Instruction Window:
                Area = 0.287606 mm^2
                Peak Dynamic = 0.721714 W
                Subthreshold Leakage = 0.00547415 W
                Gate Leakage = 0.000721338 W
                Runtime Dynamic = 0.940723 W
              FP Instruction Window:
                Area = 0.129287 mm^2
                Peak Dynamic = 0.372875 W
                Subthreshold Leakage = 0.0034355 W
                Gate Leakage = 0.00045775 W
                Runtime Dynamic = 0.486639 W
              ROB:
                Area = 1.67428 mm^2
                Peak Dynamic = 0.61461 W
                Subthreshold Leakage = 0.00500288 W
                Gate Leakage = 0.00038158 W
                Runtime Dynamic = 0.61461 W
          Integer ALUs (Count: 6 ):
            Area = 4.03603 mm^2
            Peak Dynamic = 3.52986 W
            Subthreshold Leakage = 1.89726 W
            Gate Leakage = 0.240113 W
            Runtime Dynamic = 1.8074 W
          Floating Point Units (FPUs) (Count: 2 ):
            Area = 9.71959 mm^2
            Peak Dynamic = 1.10993 W
            Subthreshold Leakage = 1.14225 W
            Gate Leakage = 0.14456 W
            Runtime Dynamic = 1.9773 W
          Complex ALUs (Mul/Div) (Count: 1 ):
            Area = 0.336336 mm^2
            Peak Dynamic = 0.405148 W
            Subthreshold Leakage = 0.158105 W
            Gate Leakage = 0.0200094 W
            Runtime Dynamic = 2.4988 W
          Results Broadcast Bus:
            Area Overhead = 0.0954831 mm^2
            Peak Dynamic = 3.47499 W
            Subthreshold Leakage = 0.0752739 W
            Gate Leakage = 0.00952648 W
            Runtime Dynamic = 8.1649 W
    L2
    Area = 16.1307 mm^2
    Peak Dynamic = 2.55285 W
    Subthreshold Leakage = 1.29868 W
    Gate Leakage = 0.012304 W
    Runtime Dynamic = 5.01368 W
 *****************************************************************************************
      L3
      Area = 278.843 mm^2
      Peak Dynamic = 4.84476 W
      Subthreshold Leakage = 10.7416 W
      Gate Leakage = 0.144361 W
      Runtime Dynamic = 4.09781 W
 *****************************************************************************************
 BUSES
      Area = 5.56828 mm^2
      Peak Dynamic = 12.9339 W
      Subthreshold Leakage = 0.0684953 W
      Gate Leakage = 0.0128043 W
      Runtime Dynamic = 12.9339 W
      Bus: 
        Area = 5.56828 mm^2
        Peak Dynamic = 12.9339 W
        Subthreshold Leakage = 0.0684953 W
        Gate Leakage = 0.0128043 W
        Runtime Dynamic = 12.9339 W
 *****************************************************************************************
--- a/ext/mcpat/sharedcache.cc
+++ b/ext/mcpat/sharedcache.cc
--- a/ext/mcpat/sharedcache.h
+++ b/ext/mcpat/sharedcache.h
@ -0,0 +1,89 @@
 /*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/
 #ifndef SHAREDCACHE_H_
 #define SHAREDCACHE_H_
 #include <vector>
 #include "XML_Parse.h"
 #include "area.h"
 #include "array.h"
 #include "basic_components.h"
 #include "logic.h"
 #include "parameter.h"
 // Component describing one shared cache instance (the level is recorded in
 // cacheL; the constructor defaults it to L2). Exposes set_cache_param(),
 // computeEnergy() and displayEnergy(); their definitions are not in this
 // header.
 // NOTE(review): all data members are public and are declared in the order
 // below; C++ initialises members in declaration order, so do not reorder
 // them without checking the constructor in sharedcache.cc.
 class SharedCache :public Component{
  public:
    // Parsed XML configuration; presumably set from the constructor's
    // XML_interface argument -- confirm in sharedcache.cc.
    ParseXML * XML;
    // Index of this cache; presumably set from ithCache_ -- confirm.
    int ithCache;
        // Held by value, whereas the constructor receives a pointer
        // (interface_ip_); presumably a private copy -- confirm.
        InputParameter interface_ip;
        // Cache level this instance models (constructor default: L2).
        enum cache_level cacheL;
    DataCache unicache;//Shared cache
    // Cache parameter set; presumably populated by set_cache_param() -- confirm.
    CacheDynParam cachep;
    // Statistics for the "home node"; judging by the names, one set for TDP,
    // one for runtime power and one scratch/temporary set -- TODO confirm.
    statsDef   homenode_tdp_stats;
    statsDef   homenode_rtp_stats;
    statsDef   homenode_stats_t;
    // Directory overhead; unit/meaning not visible in this header.
    double	   dir_overhead;
    // Legacy declarations kept commented out by the original authors.
    //	cache_processor llCache,directory, directory1, inv_dir;
    //pipeline pipeLogicCache, pipeLogicDirectory;
    //clock_network				clockNetwork;
    double scktRatio, executionTime;
    //   Component L2Tot, cc, cc1, ccTot;
    // Builds the model for cache number ithCache_ from the XML description.
    // cacheL_ may be omitted, in which case L2 is used.
    SharedCache(ParseXML *XML_interface, int ithCache_, InputParameter* interface_ip_,enum cache_level cacheL_ =L2);
    // Defined in the implementation file; presumably fills cachep from XML.
    void set_cache_param();
        // is_tdp defaults to true. NOTE(review): presumably selects TDP (peak)
        // versus runtime accounting -- confirm in sharedcache.cc.
        void computeEnergy(bool is_tdp=true);
    // Prints this component's results. indent defaults to 0 and is_tdp to
    // true; indent is presumably the report indentation width -- confirm.
    void displayEnergy(uint32_t indent = 0,bool is_tdp=true);
    // Empty inline destructor: no explicit cleanup is performed here.
    ~SharedCache(){};
 };
 // Component for a cache-coherence directory (going by the class name and
 // the shadow_dir member -- TODO confirm against sharedcache.cc). Exposes
 // computeEnergy() and displayEnergy(); their definitions are not in this
 // header.
 // NOTE(review): the class holds a raw ArrayST pointer and declares an
 // out-of-line destructor but no copy constructor or copy assignment. If the
 // destructor deletes shadow_dir, copying a CCdir would double-free --
 // verify in the implementation file.
 class CCdir :public Component{
  public:
    // Parsed XML configuration; presumably set from the constructor's
    // XML_interface argument -- confirm.
    ParseXML * XML;
    // Index of this instance; presumably set from ithCache_ -- confirm.
    int ithCache;
        // Held by value, whereas the constructor receives a pointer
        // (interface_ip_); presumably a private copy -- confirm.
        InputParameter interface_ip;
    DataCache dc;//Shared cache
    // Raw pointer; ownership and lifetime are decided in the .cc file
    // (see the review note on the class).
    ArrayST * shadow_dir;
    // Legacy declarations kept commented out by the original authors.
 //	cache_processor llCache,directory, directory1, inv_dir;
    //pipeline pipeLogicCache, pipeLogicDirectory;
    //clock_network				clockNetwork;
    double scktRatio, clockRate, executionTime;
    // Sub-component accumulators; their exact roles are not visible here.
    Component L2Tot, cc, cc1, ccTot;
    // Builds the model for instance number ithCache_ from the XML description.
    CCdir(ParseXML *XML_interface, int ithCache_, InputParameter* interface_ip_);
    // is_tdp defaults to true. NOTE(review): presumably selects TDP (peak)
    // versus runtime accounting -- confirm in sharedcache.cc.
    void computeEnergy(bool is_tdp=true);
    // Prints this component's results. indent defaults to 0 and is_tdp to
    // true; indent is presumably the report indentation width -- confirm.
    void displayEnergy(uint32_t indent = 0,bool is_tdp=true);
    // Declared here, defined out of line.
    ~CCdir();
 };
 #endif /* SHAREDCACHE_H_ */
--- a/Show more
+++ b/Show more