gem5/ext/mcpat/logic.cc
Yasuko Eckert 0deef376d9 ext: McPAT interface changes and fixes
This patch includes software engineering changes and some generic bug fixes
Joel Hestness and Yasuko Eckert made to McPAT 0.8. There are still known
issues/concerns we did not have a chance to address in this patch.

High-level changes in this patch include:
 1) Making XML parsing modular and hierarchical:
   - Shift parsing responsibility into the components
   - Read XML in a (mostly) context-free recursive manner so that McPAT input
     files can contain arbitrary component hierarchies
 2) Making power, energy, and area calculations a hierarchical and recursive
    process
   - Components track their subcomponents and recursively call compute
     functions in stages
   - Make C++ object hierarchy reflect inheritance of classes of components
     with similar structures
   - Simplify computeArea() and computeEnergy() functions to eliminate
     successive calls to calculate separate TDP vs. runtime energy
   - Remove Processor component (now unnecessary) and introduce a more abstract
     System component
 3) Standardizing McPAT output across all components
   - Use a single, common data structure for storing and printing McPAT output
   - Recursively call print functions through component hierarchy
 4) For caches, allow splitting data array and tag array reads and writes for
    better accuracy
 5) Improving the usability of CACTI by printing more helpful warning and error
    messages
 6) Minor: Impose more rigorous code style for clarity (more work still to be
    done)
Overall, these changes greatly reduce the amount of replicated code, and they
improve McPAT runtime and decrease memory footprint.
2014-06-03 13:32:59 -07:00

959 lines
45 KiB
C++

/*****************************************************************************
* McPAT
* SOFTWARE LICENSE AGREEMENT
* Copyright 2012 Hewlett-Packard Development Company, L.P.
* Copyright (c) 2010-2013 Advanced Micro Devices, Inc.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#include "common.h"
#include "logic.h"
//selection_logic
/**
 * Construct the instruction-selection logic model.
 *
 * Records the window size, issue width, access count and device/core type,
 * takes a private copy of the interface parameters in l_ip, and runs
 * init_interface() on that copy. The returned result is kept in
 * local_result (read later by computeArea()).
 *
 * @param _xml_data            XML node forwarded to McPATComponent
 * @param _is_default          stored in is_default
 * @param _win_entries         number of window entries to arbitrate over
 * @param issue_width_         issue width of the core
 * @param configure_interface  interface parameters; copied, not retained
 * @param _name                component name
 * @param _accesses            access count used for runtime energy
 * @param clockRate_           clock rate used for peak dynamic power
 * @param device_ty_           device type used for long-channel reduction
 * @param core_ty_             core type used for long-channel reduction
 */
selection_logic::selection_logic(XMLNode* _xml_data, bool _is_default,
                                 int _win_entries, int issue_width_,
                                 const InputParameter *configure_interface,
                                 string _name, double _accesses,
                                 double clockRate_, enum Device_ty device_ty_,
                                 enum Core_type core_ty_)
    : McPATComponent(_xml_data),
      is_default(_is_default),
      win_entries(_win_entries),
      issue_width(issue_width_),
      accesses(_accesses),
      device_ty(device_ty_),
      core_ty(core_ty_)
{
    // Both l_ip and name must be in place before init_interface() is called.
    l_ip = *configure_interface;
    name = _name;
    clockRate = clockRate_;

    local_result = init_interface(&l_ip, name);
}
// Publish the area of the init_interface() result that the constructor
// stored in local_result.
// NOTE(review): other components in this file divide their area by 1e6
// before storing it in output_data.area; confirm the units match here.
void selection_logic::computeArea() {
output_data.area = local_result.area;
}
/**
 * Compute dynamic energy and leakage of the selection (arbiter tree) logic
 * and publish them in output_data.
 *
 * Based on the "cost effective superscalar processor" TR, pp. 27-31.
 * Transistor widths are the 0.8um process values scaled by the feature size.
 *
 * The tree is built from 4-input arbiters: the entry count is repeatedly
 * divided by four (rounded up) and each level's count is added to
 * num_arbiter. The loop works on a local copy of win_entries so that the
 * member is not destroyed and calling this function again yields the same
 * result.
 */
void selection_logic::computeEnergy() {
    double Ctotal, Cor, Cpencode;
    int num_arbiter;
    double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp;

    //the 0.8um process data is used.
    //this was 10 micron for the 0.8 micron process
    WSelORn = 12.5 * l_ip.F_sz_um;
    //this was 40 micron for the 0.8 micron process
    WSelORprequ = 50 * l_ip.F_sz_um;
    //this was 10 micron for the 0.8 micron process
    WSelPn = 12.5 * l_ip.F_sz_um;
    //this was 15 micron for the 0.8 micron process
    WSelPp = 18.75 * l_ip.F_sz_um;
    //this was 5 micron for the 0.8 micron process
    WSelEnn = 6.25 * l_ip.F_sz_um;
    //this was 10 micron for the 0.8 micron process
    WSelEnp = 12.5 * l_ip.F_sz_um;

    Ctotal = 0;
    num_arbiter = 1;
    // Count arbiters level by level without modifying the win_entries member.
    int remaining_entries = win_entries;
    while (remaining_entries > 4) {
        remaining_entries = (int)ceil((double)remaining_entries / 4.0);
        num_arbiter += remaining_entries;
    }

    //the 4-input OR logic to generate anyreq
    Cor = 4 * drain_C_(WSelORn, NCH, 1, 1, g_tp.cell_h_def) +
        drain_C_(WSelORprequ, PCH, 1, 1, g_tp.cell_h_def);
    // NOTE(review): this OR-gate gate leakage is overwritten further down by
    // the priority-encoder gate leakage, so it never reaches the output.
    // Kept as in the original; confirm whether it should be accumulated.
    power.readOp.gate_leakage =
        cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor) * g_tp.peri_global.Vdd;

    //The total capacity of the 4-bit priority encoder
    Cpencode = drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
        drain_C_(WSelPp, PCH, 1, 1, g_tp.cell_h_def) +
        2 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
        drain_C_(WSelPp, PCH, 2, 1, g_tp.cell_h_def) +
        3 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
        drain_C_(WSelPp, PCH, 3, 1, g_tp.cell_h_def) +
        4 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
        drain_C_(WSelPp, PCH, 4, 1, g_tp.cell_h_def) +//precompute priority logic
        2 * 4 * gate_C(WSelEnn + WSelEnp, 20.0) +
        4 * drain_C_(WSelEnn, NCH, 1, 1, g_tp.cell_h_def) +
        2 * 4 * drain_C_(WSelEnp, PCH, 1, 1, g_tp.cell_h_def) +//enable logic
        (2 * 4 + 2 * 3 + 2 * 2 + 2) *
        gate_C(WSelPn + WSelPp, 10.0);//requests signal

    Ctotal += issue_width * num_arbiter * (Cor + Cpencode);

    //2 means the arbitration signal needs to travel round trip
    power.readOp.dynamic =
        Ctotal * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 2;

    power.readOp.leakage = issue_width * num_arbiter *
        (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)//grant1p, precompute approximated with a nor gate
         + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p
         + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p
         + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
         + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//two inverters per grant, 3 grant signals
        ) * g_tp.peri_global.Vdd;
    power.readOp.gate_leakage = issue_width * num_arbiter *
        (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)//grant1p, precompute approximated with a nor gate
         + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p
         + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p
         + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
         + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//two inverters per grant, 3 grant signals
        ) * g_tp.peri_global.Vdd;

    // Scale dynamic energy by the short-circuit coefficient.
    double sckRation = g_tp.sckt_co_eff;
    power.readOp.dynamic *= sckRation;
    power.writeOp.dynamic *= sckRation;
    power.searchOp.dynamic *= sckRation;

    double long_channel_device_reduction =
        longer_channel_device_reduction(device_ty, core_ty);
    power.readOp.longer_channel_leakage =
        power.readOp.leakage * long_channel_device_reduction;

    output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
    output_data.subthreshold_leakage_power = power.readOp.leakage;
    output_data.gate_leakage_power = power.readOp.gate_leakage;
    output_data.runtime_dynamic_energy = power.readOp.dynamic * accesses;
}
/**
 * Construct the dependency/resource conflict check logic model and compute
 * its power immediately.
 *
 * Transistor widths are the 0.8um process values scaled by the feature size.
 * They are set before init_interface() is invoked on the local parameter
 * copy, and conflict_check_power() is run once the comparator width has
 * been extended.
 */
dep_resource_conflict_check::dep_resource_conflict_check(
    XMLNode* _xml_data, const string _name,
    const InputParameter *configure_interface,
    const CoreParameters & dyn_p_, int compare_bits_,
    double clockRate_, bool _is_default)
    : McPATComponent(_xml_data),
      l_ip(*configure_interface),
      coredynp(dyn_p_),
      compare_bits(compare_bits_),
      is_default(_is_default)
{
    name = _name;
    clockRate = clockRate_;

    // Values in parentheses are the original 0.8 micron process widths.
    Wcompn = 25 * l_ip.F_sz_um;         // (20.0 micron)
    Wevalinvp = 25 * l_ip.F_sz_um;      // (20.0 micron)
    Wevalinvn = 100 * l_ip.F_sz_um;     // (80.0 micron)
    Wcomppreequ = 50 * l_ip.F_sz_um;    // (40.0 micron)
    WNORn = 6.75 * l_ip.F_sz_um;        // (5.4 micron)
    WNORp = 38.125 * l_ip.F_sz_um;      // (30.5 micron)

    // To make CACTI happy.
    l_ip.cache_sz = MIN_BUFFER_SIZE;
    local_result = init_interface(&l_ip, name);

    //TODO: opcode bits + log(shared resources) + REG TAG BITS -->
    //opcode comparator
    // In-order and out-of-order cores currently add the same number of bits.
    compare_bits += 16 + 8 + 8;

    conflict_check_power();

    // Scale dynamic energy by the short-circuit coefficient.
    const double sckRation = g_tp.sckt_co_eff;
    power.readOp.dynamic *= sckRation;
    power.writeOp.dynamic *= sckRation;
    power.searchOp.dynamic *= sckRation;
}
void dep_resource_conflict_check::conflict_check_power() {
double Ctotal;
int num_comparators;
//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for
//dest to dest comparision.
num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) -
coredynp.decodeW);
Ctotal = num_comparators * compare_cap();
power.readOp.dynamic = Ctotal * /*CLOCKRATE*/ g_tp.peri_global.Vdd *
g_tp.peri_global.Vdd /*AF*/;
power.readOp.leakage = num_comparators * compare_bits * 2 *
simplified_nmos_leakage(Wcompn, false);
double long_channel_device_reduction =
longer_channel_device_reduction(Core_device, coredynp.core_ty);
power.readOp.longer_channel_leakage =
power.readOp.leakage * long_channel_device_reduction;
power.readOp.gate_leakage = num_comparators * compare_bits * 2 *
cmos_Ig_leakage(Wcompn, 0, 2, nmos);
}
/* Estimate the switched capacitance of one comparator (this comparator is
 * similar to the tag-match structure in a CAM).
 *
 * The PMOS width of the big NOR gate is scaled with the fan-in
 * (compare_bits). The scaled width is held in a local, so the WNORp member
 * keeps its base value and repeated calls return the same capacitance
 * instead of compounding the scaling.
 * NOTE(review): within this file WNORp is only read here; confirm no other
 * code relies on the member holding the scaled width after this call.
 *
 * Returns the sum of the top and bottom comparator node capacitances.
 */
double dep_resource_conflict_check::compare_cap() {
    double c1, c2;

    //resize the big NOR gate at the DCL according to fan in.
    const double scaled_WNORp = WNORp * compare_bits / 2.0;

    /* bottom part of comparator */
    c2 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) +
                           drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def)) +
        drain_C_(Wevalinvp, PCH, 1, 1, g_tp.cell_h_def) +
        drain_C_(Wevalinvn, NCH, 1, 1, g_tp.cell_h_def);

    /* top part of comparator */
    c1 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) +
                           drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def) +
                           drain_C_(Wcomppreequ, NCH, 1, 1, g_tp.cell_h_def)) +
        gate_C(WNORn + scaled_WNORp, 10.0) +
        drain_C_(scaled_WNORp, NCH, 2, 1, g_tp.cell_h_def) + compare_bits *
        drain_C_(WNORn, NCH, 2, 1, g_tp.cell_h_def);

    return(c1 + c2);
}
void dep_resource_conflict_check::leakage_feedback(double temperature)
{
l_ip.temp = (unsigned int)round(temperature/10.0)*10;
uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy
// This is part of conflict_check_power()
// 2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest
// to dest comparison.
int num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) -
coredynp.decodeW);
power.readOp.leakage = num_comparators * compare_bits * 2 *
simplified_nmos_leakage(Wcompn, false);
double long_channel_device_reduction =
longer_channel_device_reduction(Core_device, coredynp.core_ty);
power.readOp.longer_channel_leakage = power.readOp.leakage *
long_channel_device_reduction;
power.readOp.gate_leakage = num_comparators * compare_bits * 2 *
cmos_Ig_leakage(Wcompn, 0, 2, nmos);
}
/**
 * Construct a D flip-flop cell model. This model is based on the NAND2
 * based DFF.
 *
 * Stores the NAND transistor widths and the output load, copies the
 * interface parameters, and sets the cell area to the sum of six gate areas.
 */
DFFCell::DFFCell(
    bool _is_dram,
    double _WdecNANDn,
    double _WdecNANDp,
    double _cell_load,
    const InputParameter *configure_interface)
    : is_dram(_is_dram),
      cell_load(_cell_load),
      WdecNANDn(_WdecNANDn),
      WdecNANDp(_WdecNANDp)
{
    l_ip = *configure_interface;

    // Five identical NAND2 gates...
    const double nand2_area =
        compute_gate_area(NAND, 2, WdecNANDn, WdecNANDp, g_tp.cell_h_def);
    // ...plus one more gate.
    // NOTE(review): this gate is sized here as a 2-input NAND with
    // (WdecNANDn, WdecNANDn), while compute_DFF_cell() treats the sixth
    // gate as a 3-input NAND; confirm which is intended.
    const double last_gate_area =
        compute_gate_area(NAND, 2, WdecNANDn, WdecNANDn, g_tp.cell_h_def);

    area.set_area(5 * nand2_area + last_gate_area);
}
double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out) {
double Ctotal = 0;
/* part 1: drain cap of NAND gate */
Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram);
/* part 2: gate cap of NAND gates */
Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
return Ctotal;
}
/**
 * Accumulate the per-cell energies of the flip-flop into e_switch,
 * e_keep_0, e_keep_1 and e_clock, and set clock_cap.
 *
 * All results are added with += to whatever the members already hold.
 */
void DFFCell::compute_DFF_cell() {
    const double vdd = g_tp.peri_global.Vdd;

    /* node 5 and node 6 are identical to node 1 in capacitance */
    const double node1 = fpfp_node_cap(2, 1);
    const double node2 = fpfp_node_cap(2, 3);
    const double node3 = fpfp_node_cap(3, 2);
    const double node4 = fpfp_node_cap(2, 2);
    const double node5 = node1;
    const double node6 = node1;

    //cap-load of the clock signal in each Dff, actually the clock signal
    //is only connected to one NAND2
    clock_cap = 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);

    /* dynamic energy */
    e_switch.readOp.dynamic +=
        (node4 + node1 + node2 + node3 + node5 + node6 + 2 * cell_load) *
        0.5 * vdd * vdd;
    /* no 1/2 for e_keep and e_clock because the clock signal switches
       twice in one cycle */
    e_keep_1.readOp.dynamic += node3 * vdd * vdd;
    e_keep_0.readOp.dynamic += node2 * vdd * vdd;
    e_clock.readOp.dynamic += clock_cap * vdd * vdd;

    /* static power: 5 NAND2 and 1 NAND3 in a DFF */
    e_switch.readOp.leakage +=
        (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand) * 5
         + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand)) * vdd;
    e_switch.readOp.gate_leakage +=
        (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand) * 5
         + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand)) * vdd;
}
/**
 * Construct the pipeline-register model and compute its power and area
 * immediately.
 *
 * The clock rate is taken from the core parameters before compute() runs,
 * because compute() multiplies the dynamic energy by clockRate to obtain
 * peak dynamic power. The other component constructors in this file set
 * clockRate before it is used in the same way.
 * NOTE(review): for pipelines built with _is_core_pipeline == false,
 * confirm that dyn_p_.clockRate is the intended clock.
 *
 * @param _xml_data            XML node forwarded to McPATComponent
 * @param configure_interface  interface parameters; copied into l_ip
 * @param dyn_p_               core parameters
 * @param device_ty_           device type used for long-channel reduction
 * @param _is_core_pipeline    true to estimate register bits from the core
 *                             parameters, false to use l_ip directly
 * @param _is_default          stored in is_default
 */
Pipeline::Pipeline(XMLNode* _xml_data,
                   const InputParameter *configure_interface,
                   const CoreParameters & dyn_p_,
                   enum Device_ty device_ty_,
                   bool _is_core_pipeline,
                   bool _is_default)
    : McPATComponent(_xml_data), l_ip(*configure_interface),
      coredynp(dyn_p_), device_ty(device_ty_),
      is_core_pipeline(_is_core_pipeline), is_default(_is_default),
      num_piperegs(0.0) {
    name = "Pipeline?";
    clockRate = coredynp.clockRate;
    local_result = init_interface(&l_ip, name);

    // Non-embedded cores use the scaled 0.8um widths; embedded cores use
    // minimum-size devices.
    process_ind = !coredynp.Embedded;

    //this was 20 micron for the 0.8 micron process
    WNANDn = (process_ind) ? 25 * l_ip.F_sz_um : g_tp.min_w_nmos_ ;
    //this was 30 micron for the 0.8 micron process
    WNANDp = (process_ind) ? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_ *
        pmos_to_nmos_sz_ratio();

    load_per_pipeline_stage = 2 * gate_C(WNANDn + WNANDp, 0, false);
    compute();
}
void Pipeline::compute() {
compute_stage_vector();
DFFCell pipe_reg(false, WNANDn, WNANDp, load_per_pipeline_stage, &l_ip);
pipe_reg.compute_DFF_cell();
double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic;
//******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider
//the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power.
double pipe_reg_power = num_piperegs *
(pipe_reg.e_switch.readOp.dynamic + pipe_reg.e_keep_0.readOp.dynamic +
pipe_reg.e_keep_1.readOp.dynamic) / 3 + clock_power_pipereg;
double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage;
double pipe_reg_gate_leakage = num_piperegs *
pipe_reg.e_switch.readOp.gate_leakage;
power.readOp.dynamic += pipe_reg_power;
power.readOp.leakage += pipe_reg_leakage;
power.readOp.gate_leakage += pipe_reg_gate_leakage;
area.set_area(num_piperegs * pipe_reg.area.get_area());
double long_channel_device_reduction =
longer_channel_device_reduction(device_ty, coredynp.core_ty);
power.readOp.longer_channel_leakage = power.readOp.leakage *
long_channel_device_reduction;
double sckRation = g_tp.sckt_co_eff;
power.readOp.dynamic *= sckRation;
power.writeOp.dynamic *= sckRation;
power.searchOp.dynamic *= sckRation;
double macro_layout_overhead = g_tp.macro_layout_overhead;
if (!coredynp.Embedded)
area.set_area(area.get_area() * macro_layout_overhead);
output_data.area = area.get_area() / 1e6;
output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
output_data.subthreshold_leakage_power = power.readOp.leakage;
output_data.gate_leakage_power = power.readOp.gate_leakage;
output_data.runtime_dynamic_energy = power.readOp.dynamic * total_cycles;
}
// Estimate num_piperegs, the total number of pipeline register bits.
//
// For a non-core pipeline the count is simply
// l_ip.pipeline_stages * l_ip.per_stage_vector. For a core pipeline the
// bits latched between each pair of stages are summed (6 stages assumed for
// in-order cores, 12 for out-of-order cores), increased by 50% for control
// and interrupt registers, and, if the configured pipeline is deeper than
// the assumed depth, rescaled to the configured number of stages.
//
// The core branch accumulates with += into num_piperegs, so it relies on
// the member being zero on entry (the constructor initializes it to 0.0).
void Pipeline::compute_stage_vector() {
double num_stages, tot_stage_vector, per_stage_vector;
// x86 cores use the micro-op opcode length, other cores the opcode width.
int opcode_length = coredynp.x86 ?
coredynp.micro_opcode_length : coredynp.opcode_width;
if (!is_core_pipeline) {
//The number of pipeline stages are calculated based on the achievable
//throughput and required throughput
num_piperegs = l_ip.pipeline_stages * l_ip.per_stage_vector;
} else {
if (coredynp.core_ty == Inorder) {
/* assume 6 pipe stages and try to estimate bits per pipe stage */
/* pipe stage 0/IF */
num_piperegs += coredynp.pc_width * 2 * coredynp.num_hthreads;
/* pipe stage IF/ID */
num_piperegs += coredynp.fetchW *
(coredynp.instruction_length + coredynp.pc_width) *
coredynp.num_hthreads;
/* pipe stage IF/ThreadSEL */
if (coredynp.multithreaded) {
num_piperegs += coredynp.num_hthreads *
coredynp.perThreadState; //per-thread state bits (original note: 8 bit thread states)
}
/* pipe stage ID/EXE */
num_piperegs += coredynp.decodeW *
(coredynp.instruction_length + coredynp.pc_width +
pow(2.0, opcode_length) + 2 * coredynp.int_data_width) *
coredynp.num_hthreads;
/* pipe stage EXE/MEM */
num_piperegs += coredynp.issueW *
(3 * coredynp.arch_ireg_width + pow(2.0, opcode_length) + 8 *
2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/);
/* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/
num_piperegs += coredynp.issueW *
(2 * coredynp.int_data_width + pow(2.0, opcode_length) + 8 *
2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/);
num_stages = 6;
} else {
/* assume 12 pipe stages and try to estimate bits per pipe stage */
/*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */
/* pipe stage 0/IF */
num_piperegs +=
coredynp.pc_width * 2 * coredynp.num_hthreads ;//PC and Next PC
/* pipe stage IF/ID */
num_piperegs += coredynp.fetchW *
(coredynp.instruction_length + coredynp.pc_width) *
coredynp.num_hthreads;//PC is used to feed branch predictor in ID
/* pipe stage ID/Renaming */
num_piperegs += coredynp.decodeW *
(coredynp.instruction_length + coredynp.pc_width) *
coredynp.num_hthreads;//PC is for branch exe in later stage.
/* pipe stage Renaming/wire_drive */
num_piperegs += coredynp.decodeW *
(coredynp.instruction_length + coredynp.pc_width);
/* pipe stage Renaming/IssueQ */
//3*coredynp.phy_ireg_width means 2 sources and 1 dest
num_piperegs += coredynp.issueW *
(coredynp.instruction_length + coredynp.pc_width + 3 *
coredynp.phy_ireg_width) * coredynp.num_hthreads;
/* pipe stage IssueQ/Dispatch */
num_piperegs += coredynp.issueW *
(coredynp.instruction_length + 3 * coredynp.phy_ireg_width);
/* pipe stage Dispatch/EXE */
num_piperegs += coredynp.issueW *
(3 * coredynp.phy_ireg_width + coredynp.pc_width +
pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/);
/* 2^opcode_length means the total decoded signal for the opcode*/
num_piperegs += coredynp.issueW *
(2 * coredynp.int_data_width + pow(2.0, opcode_length)
/*+2*powers (2,reg_length)*/);
/* 2 source operands in EXE; assume 2 EXE stages since we do not really distinguish OP */
num_piperegs += coredynp.issueW *
(2 * coredynp.int_data_width + pow(2.0, opcode_length)
/*+2*powers (2,reg_length)*/);
/* pipe stage EXE/MEM, data need to be read/write, address*/
//memory Opcode still need to be passed
num_piperegs += coredynp.issueW *
(coredynp.int_data_width + coredynp.v_address_width +
pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/);
/* pipe stage MEM/WB; result data, writeback regs */
num_piperegs += coredynp.issueW *
(coredynp.int_data_width + coredynp.phy_ireg_width
/* powers (2,opcode_length) +
(2,opcode_length)+2*powers (2,reg_length)*/);
/* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/
num_piperegs += coredynp.commitW *
(coredynp.int_data_width + coredynp.v_address_width +
coredynp.phy_ireg_width
/*+ powers (2,opcode_length)*2*powers (2,reg_length)*/) *
coredynp.num_hthreads;
num_stages = 12;
}
/* assume 50% extra in control registers and interrupt registers (rule of thumb) */
num_piperegs = num_piperegs * 1.5;
tot_stage_vector = num_piperegs;
per_stage_vector = tot_stage_vector / num_stages;
// Scale up only when the configured pipeline is deeper than the assumed
// depth; shallower pipelines keep the estimate for the assumed depth.
if (coredynp.core_ty == Inorder) {
if (coredynp.pipeline_stages > 6)
num_piperegs = per_stage_vector * coredynp.pipeline_stages;
} else { //OOO
if (coredynp.pipeline_stages > 12)
num_piperegs = per_stage_vector * coredynp.pipeline_stages;
}
}
}
/**
 * Construct a functional-unit model (FPU, ALU or MUL) and compute its area,
 * leakage, base energy, per-access energy and height.
 *
 * Embedded and non-embedded cores use different empirical constants.
 * Leakage is modeled as area * transistor density * the leakage of a
 * reference inverter whose NMOS width is a multiple of the minimum width
 * (5x for the FPU, 20x for ALU and MUL), times Vdd / 2. That formula is the
 * same for every unit type, so it is evaluated once after the type-specific
 * constants have been chosen.
 *
 * An unknown fu_type prints a message and terminates with a non-zero exit
 * status, matching leakage_feedback().
 *
 * NOTE(review): pmos_to_nmos_sz_ratio() is sampled before init_interface()
 * runs, whereas leakage_feedback() samples it afterwards. The original
 * order is preserved here; confirm it is intended.
 */
FunctionalUnit::FunctionalUnit(XMLNode* _xml_data,
                               InputParameter* interface_ip_,
                               const CoreParameters & _core_params,
                               const CoreStatistics & _core_stats,
                               enum FU_type fu_type_)
    : McPATComponent(_xml_data),
      interface_ip(*interface_ip_), core_params(_core_params),
      core_stats(_core_stats), fu_type(fu_type_) {
    double area_t;
    double leakage;
    double gate_leakage;
    // Reference inverter NMOS width, in multiples of g_tp.min_w_nmos_.
    int inv_width_mult;
    double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
    clockRate = core_params.clockRate;
    uca_org_t result2;
    // Temp name for the following function call
    name = "Functional Unit";
    result2 = init_interface(&interface_ip, name);

    if (core_params.Embedded) {
        if (fu_type == FPU) {
            num_fu = core_params.num_fpus;
            //this is um^2. The base number 4.47 contains both VFP and NEON
            //processing units; VFP is about 40% and NEON is about 60%
            area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);
            if (g_ip->F_sz_nm>90)
                area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
            inv_width_mult = 5;
            base_energy = 0;
            //FPU power from Sandia's processor sizing tech report
            per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);
            FU_height=(18667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
        } else if (fu_type == ALU) {
            num_fu = core_params.num_alus;
            area_t = 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
            inv_width_mult = 20;
            base_energy = 0;
            per_access_energy = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);
            FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
        } else if (fu_type == MUL) {
            num_fu = core_params.num_muls;
            area_t = 280*260*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
            inv_width_mult = 20;
            base_energy = 0;
            //coefficient based on Wattch
            per_access_energy = 1.15*2/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);
            FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
        } else {
            cout<<"Unknown Functional Unit Type"<<endl;
            exit(1);
        }
        //According to ARM data embedded processor has much lower per acc energy
        per_access_energy *=0.5;
    } else {
        if (fu_type == FPU) {
            name = "Floating Point Unit(s)";
            num_fu = core_params.num_fpus;
            area_t = 8.47 * 1e6 * (g_ip->F_sz_nm * g_ip->F_sz_nm / 90.0 /
                                   90.0);//this is um^2
            if (g_ip->F_sz_nm > 90)
                area_t = 8.47 * 1e6 *
                    g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
            inv_width_mult = 5;
            //W The base energy of ALU average numbers from Intel 4G and
            //773Mhz (Wattch)
            base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 3;
            base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
                            1.2);
            per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);
            FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
        } else if (fu_type == ALU) {
            name = "Integer ALU(s)";
            num_fu = core_params.num_alus;
            //this is um^2 ALU + MUl
            area_t = 280 * 260 * 2 * g_tp.scaling_factor.logic_scaling_co_eff;
            inv_width_mult = 20;
            //W The base energy of ALU average numbers from Intel 4G and 773Mhz
            //(Wattch)
            base_energy = core_params.core_ty == Inorder ? 0 : 89e-3;
            base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
                            1.2);
            per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);
            FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
        } else if (fu_type == MUL) {
            name = "Multiply/Divide Unit(s)";
            num_fu = core_params.num_muls;
            //this is um^2 ALU + MUl
            area_t = 280 * 260 * 2 * 3 *
                g_tp.scaling_factor.logic_scaling_co_eff;
            inv_width_mult = 20;
            //W The base energy of ALU average numbers from Intel 4G and 773Mhz
            //(Wattch)
            base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 2;
            base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
                            1.2);
            //coefficient based on Wattch
            per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);
            FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
        } else {
            cout << "Unknown Functional Unit Type" << endl;
            exit(1);
        }
    }

    // Per-unit leakage (unit W), identical in form for every unit type.
    leakage = area_t *(g_tp.scaling_factor.core_tx_density)*
        cmos_Isub_leakage(inv_width_mult*g_tp.min_w_nmos_,
                          inv_width_mult*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r,
                          1, inv)*g_tp.peri_global.Vdd/2;
    gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*
        cmos_Ig_leakage(inv_width_mult*g_tp.min_w_nmos_,
                        inv_width_mult*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r,
                        1, inv)*g_tp.peri_global.Vdd/2;

    area.set_area(area_t*num_fu);
    power.readOp.leakage = leakage * num_fu;
    power.readOp.gate_leakage = gate_leakage * num_fu;

    double long_channel_device_reduction =
        longer_channel_device_reduction(Core_device, core_params.core_ty);
    power.readOp.longer_channel_leakage =
        power.readOp.leakage * long_channel_device_reduction;

    double macro_layout_overhead = g_tp.macro_layout_overhead;
    area.set_area(area.get_area()*macro_layout_overhead);
}
/**
 * Compute TDP and runtime dynamic energy of the functional units and
 * publish area, power and energy in output_data.
 *
 * TDP uses num_fu accesses scaled by the unit's duty cycle; runtime energy
 * uses the access count recorded in core_stats for this unit type.
 */
void FunctionalUnit::computeEnergy() {
    double pppm_t[4] = {1, 1, 1, 1};
    double FU_duty_cycle;
    const double sckRation = g_tp.sckt_co_eff;

    //2 means two source operands needs to be passed for each int instruction.
    set_pppm(pppm_t, 2, 2, 2, 2);

    // Select the duty cycle (TDP) and access count (runtime) for this unit.
    tdp_stats.readAc.access = num_fu;
    switch (fu_type) {
    case FPU:
        FU_duty_cycle = core_stats.FPU_duty_cycle;
        rtp_stats.readAc.access = core_stats.fpu_accesses;
        break;
    case ALU:
        FU_duty_cycle = core_stats.ALU_duty_cycle;
        rtp_stats.readAc.access = core_stats.ialu_accesses;
        break;
    case MUL:
        FU_duty_cycle = core_stats.MUL_duty_cycle;
        rtp_stats.readAc.access = core_stats.mul_accesses;
        break;
    default:
        break;
    }

    // TDP power calculation
    power.readOp.dynamic =
        per_access_energy * tdp_stats.readAc.access + base_energy / clockRate;
    power.readOp.dynamic *= sckRation * FU_duty_cycle;

    // Runtime power calculation
    rt_power.readOp.dynamic = per_access_energy * rtp_stats.readAc.access +
        base_energy * execution_time;
    rt_power.readOp.dynamic *= sckRation;

    output_data.area = area.get_area() / 1e6;
    output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
    if (longer_channel_device) {
        output_data.subthreshold_leakage_power =
            power.readOp.longer_channel_leakage;
    } else {
        output_data.subthreshold_leakage_power = power.readOp.leakage;
    }
    output_data.gate_leakage_power = power.readOp.gate_leakage;
    output_data.runtime_dynamic_energy = rt_power.readOp.dynamic;
}
void FunctionalUnit::leakage_feedback(double temperature)
{
// Update the temperature and initialize the global interfaces.
interface_ip.temp = (unsigned int)round(temperature/10.0)*10;
// init_result is dummy
uca_org_t init_result = init_interface(&interface_ip, name);
// This is part of FunctionalUnit()
double area_t, leakage, gate_leakage;
double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
if (fu_type == FPU)
{
area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
if (g_ip->F_sz_nm>90)
area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
}
else if (fu_type == ALU)
{
area_t = 280*260*2*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
}
else if (fu_type == MUL)
{
area_t = 280*260*2*3*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
}
else
{
cout<<"Unknown Functional Unit Type"<<endl;
exit(1);
}
power.readOp.leakage = leakage*num_fu;
power.readOp.gate_leakage = gate_leakage*num_fu;
power.readOp.longer_channel_leakage =
longer_channel_device_reduction(Core_device, core_params.core_ty);
}
/**
 * Build the model of the "undifferentiated" part of a core.
 *
 * The area is estimated from curve fits (see the comments on each branch
 * below), scaled by the technology's logic scaling coefficient, and then
 * used together with the core transistor density to derive subthreshold
 * and gate leakage. The results are published in output_data.
 *
 * @param _xml_data      XML node forwarded to the McPATComponent base.
 * @param interface_ip_  Input parameters; copied into this component.
 * @param dyn_p_         Core parameters (core type, pipeline depth, number
 *                       of hardware threads, issue width, clock rate, ...).
 * @param exist_         When false the constructor returns immediately and
 *                       none of the estimates below are computed.
 *
 * Terminates the process with a non-zero status if the core type of a
 * non-embedded core is neither OOO nor Inorder.
 */
UndiffCore::UndiffCore(XMLNode* _xml_data, InputParameter* interface_ip_,
                       const CoreParameters & dyn_p_,
                       bool exist_)
    : McPATComponent(_xml_data),
      interface_ip(*interface_ip_), coredynp(dyn_p_),
      core_ty(coredynp.core_ty), embedded(coredynp.Embedded),
      pipeline_stage(coredynp.pipeline_stages),
      num_hthreads(coredynp.num_hthreads), issue_width(coredynp.issueW),
      exist(exist_) {
    if (!exist) return;

    name = "Undifferentiated Core";
    clockRate = coredynp.clockRate;

    double undifferentiated_core = 0;

    // The returned organization is not used here.
    uca_org_t result2 = init_interface(&interface_ip, name);

    // Queried only after init_interface(), as the other constructors in this
    // file do. NOTE(review): presumably the ratio is derived from the
    // technology parameters that init_interface() sets up -- confirm.
    const double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();

    // Compute undifferentiated core area at 90nm.
    if (!embedded) {
        // Based on the results of polynomial/log curve fitting based on
        // undifferentiated core of Niagara, Niagara2, Merom, Penyrn,
        // Prescott, Opteron die measurements.
        // Each fit is clamped so that it never goes below zero.
        if (core_ty == OOO) {
            const double fit = 3.57 * log(pipeline_stage) - 1.2643;
            undifferentiated_core = fit > 0 ? fit : 0;
        } else if (core_ty == Inorder) {
            const double fit = -2.19 * log(pipeline_stage) + 6.55;
            undifferentiated_core = fit > 0 ? fit : 0;
        } else {
            cout << "invalid core type" << endl;
            // This is an error path: report failure to the caller, like the
            // unknown-functional-unit path elsewhere in this file does.
            exit(1);
        }
        undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0716);
    } else {
        // Based on the results in paper "parametrized processor models"
        // Sandia Labs.
        const double undifferentiated_core_coe = opt_for_clk ? 0.05 : 0;
        // NOTE(review): unlike the non-embedded fits, this one is not
        // clamped at zero, so a very short pipeline yields a negative value
        // when opt_for_clk is set -- confirm whether that is intended.
        undifferentiated_core = (0.4109 * pipeline_stage - 0.776) *
            undifferentiated_core_coe;
        undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0426);
    }

    undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff *
        1e6;  // change from mm^2 to um^2

    const double core_tx_density = g_tp.scaling_factor.core_tx_density;

    // Leakage is evaluated for an inverter with 5x minimum-width devices.
    const double nmos_width = 5 * g_tp.min_w_nmos_;
    const double pmos_width = 5 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
    power.readOp.leakage = undifferentiated_core * (core_tx_density) *
        cmos_Isub_leakage(nmos_width, pmos_width, 1, inv) *
        g_tp.peri_global.Vdd;  // unit W
    power.readOp.gate_leakage = undifferentiated_core * (core_tx_density) *
        cmos_Ig_leakage(nmos_width, pmos_width, 1, inv) *
        g_tp.peri_global.Vdd;

    const double long_channel_device_reduction =
        longer_channel_device_reduction(Core_device, coredynp.core_ty);
    power.readOp.longer_channel_leakage =
        power.readOp.leakage * long_channel_device_reduction;

    area.set_area(undifferentiated_core);

    // Apply the short-circuit coefficient to every dynamic component.
    scktRatio = g_tp.sckt_co_eff;
    power.readOp.dynamic *= scktRatio;
    power.writeOp.dynamic *= scktRatio;
    power.searchOp.dynamic *= scktRatio;

    // Account for macro place-and-route overhead in the final area.
    macro_PR_overhead = g_tp.macro_layout_overhead;
    area.set_area(area.get_area() * macro_PR_overhead);

    // Publish results; the area above is in um^2, so /1e6 gives mm^2.
    output_data.area = area.get_area() / 1e6;
    output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
    output_data.subthreshold_leakage_power =
        longer_channel_device ? power.readOp.longer_channel_leakage :
        power.readOp.leakage;
    output_data.gate_leakage_power = power.readOp.gate_leakage;
}
/**
 * Build an instruction-decoder model out of a final decoder stage and a
 * pre-decoder (two pre-decode blocks, each with its own driver).
 *
 * Modeling background:
 *  - An instruction decoder is not the same thing as the n-to-2^n row
 *    decoders found in memory arrays. A RISC decoder is typically very
 *    simple: the machine word is split into fields using wire slices.
 *    It can be approximated by an n-to-2^n decoder, although that usually
 *    underestimates power because a decoded instruction normally has more
 *    than one active signal.
 *  - Decoding a CISC instruction word is much harder. A CISC decoder is
 *    typically a state machine: it reads the opcode field to determine the
 *    instruction type and where the other values are, consumes the word
 *    piece by piece, and decides at each stage how the remainder will be
 *    read (a sequencer and ROM are usually needed).
 *  - An x86 decoder can be more complex still, since it both decodes
 *    instructions into u-ops and merges u-ops for micro-op fusion.
 *
 * @param _xml_data            XML node forwarded to the McPATComponent base.
 * @param _name                Component name.
 * @param _is_default          Stored in is_default.
 * @param configure_interface  Input parameters; copied into l_ip.
 * @param opcode_length_       Opcode length; opcodes longer than 18 are
 *                             split into ceil(length / 18) segments.
 * @param num_decoders_        Number of decoders.
 * @param x86_                 Stored in x86.
 * @param clockRate_           Clock rate used for peak dynamic power.
 * @param device_ty_           Device type used for long-channel reduction.
 * @param core_ty_             Core type used for long-channel reduction.
 */
InstructionDecoder::InstructionDecoder(XMLNode* _xml_data, const string _name,
                                       bool _is_default,
                                       const InputParameter *configure_interface,
                                       int opcode_length_, int num_decoders_,
                                       bool x86_,
                                       double clockRate_,
                                       enum Device_ty device_ty_,
                                       enum Core_type core_ty_)
    : McPATComponent(_xml_data), is_default(_is_default),
      opcode_length(opcode_length_), num_decoders(num_decoders_), x86(x86_),
      device_ty(device_ty_), core_ty(core_ty_) {
    name = _name;
    clockRate = clockRate_;

    // Take a private copy of the interface parameters and initialize with it.
    l_ip = *configure_interface;
    local_result = init_interface(&l_ip, name);

    // Both cell dimensions are taken from the default cell height.
    Area cell;
    cell.h = g_tp.cell_h_def;
    cell.w = g_tp.cell_h_def;

    // Split long opcodes into segments of at most 18 bits each.
    num_decoder_segments = static_cast<int>(ceil(opcode_length / 18.0));
    if (opcode_length > 18) {
        opcode_length = 18;
    }
    num_decoded_signals = static_cast<int>(pow(2.0, opcode_length));

    // Load seen by the decoder: driver gate capacitance and wire resistance.
    const bool is_dram = false;
    const double p_to_n_ratio = pmos_to_nmos_sz_ratio();
    const double nmos_load_w = g_tp.max_w_nmos_ / 2;
    const double pmos_load_w = g_tp.max_w_nmos_ * p_to_n_ratio;
    const double driver_load_cap =
        1024 * gate_C(nmos_load_w + pmos_load_w, 0, is_dram);
    const double wire_load_res =
        3000 * l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um;

    final_dec = new Decoder(num_decoded_signals,
                            false,
                            driver_load_cap,
                            wire_load_res,
                            false,  // is_fa
                            false,  // is_dram
                            false,  // wl_tr: to use peri device
                            cell);

    // Predec and dec are assumed to be back to back (both distances 0), and
    // each predec only drives one final dec.
    PredecBlk* first_blk = new PredecBlk(num_decoded_signals,
                                         final_dec,
                                         0,
                                         0,
                                         1,
                                         false,  // is_dram
                                         true);
    PredecBlk* second_blk = new PredecBlk(num_decoded_signals,
                                          final_dec,
                                          0,
                                          0,
                                          1,
                                          false,  // is_dram
                                          false);
    PredecBlkDrv* first_drv = new PredecBlkDrv(0, first_blk, false);
    PredecBlkDrv* second_drv = new PredecBlkDrv(0, second_blk, false);
    pre_dec = new Predec(first_drv, second_drv);

    // Total area: decoder plus pre-decoder, then layout overheads.
    const double decoder_area = final_dec->area.get_area() *
        num_decoded_signals * num_decoder_segments * num_decoders;
    const double predecoder_area = (first_drv->area.get_area() +
                                    second_drv->area.get_area() +
                                    first_blk->area.get_area() +
                                    second_blk->area.get_area()) *
        num_decoder_segments * num_decoders;
    area.set_area(area.get_area() + decoder_area + predecoder_area);
    area.set_area(area.get_area() * g_tp.macro_layout_overhead *
                  g_tp.chip_layout_overhead);

    inst_decoder_delay_power();

    // Apply the short-circuit coefficient to every dynamic component.
    const double sckt_ratio = g_tp.sckt_co_eff;
    power.readOp.dynamic *= sckt_ratio;
    power.writeOp.dynamic *= sckt_ratio;
    power.searchOp.dynamic *= sckt_ratio;

    power.readOp.longer_channel_leakage = power.readOp.leakage *
        longer_channel_device_reduction(device_ty, core_ty);

    // Publish results.
    output_data.area = area.get_area() / 1e6;
    output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
    output_data.subthreshold_leakage_power = power.readOp.leakage;
    output_data.gate_leakage_power = power.readOp.gate_leakage;
}
/**
 * Run the delay computation of the pre-decoder and the final decoder, then
 * add their scaled power onto this component's power.
 */
void InstructionDecoder::inst_decoder_delay_power() {
    // An x86 decoder is charged two sequencer passes, anything else one.
    const double sequencer_passes = x86 ? 2 : 1;
    const double pass_segments = sequencer_passes * num_decoder_segments;

    // The pre-decoder's output rise time feeds the final decoder. The final
    // decoder's own output rise time is not needed afterwards.
    double in_rise_time = 0;
    double predec_out_rise_time = pre_dec->compute_delays(in_rise_time);
    final_dec->compute_delays(predec_out_rise_time);

    double scale[4] = {1, 1, 1, 1};

    // Pre-decoder contribution.
    set_pppm(scale,
             pass_segments,
             num_decoder_segments,
             pass_segments,
             num_decoder_segments);
    power = power + pre_dec->power * scale;

    // Final decoder contribution.
    set_pppm(scale,
             pass_segments,
             num_decoder_segments * num_decoded_signals,
             num_decoder_segments * num_decoded_signals,
             pass_segments);
    power = power + final_dec->power * scale;
}
void InstructionDecoder::leakage_feedback(double temperature) {
l_ip.temp = (unsigned int)round(temperature/10.0)*10;
uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy
final_dec->leakage_feedback(temperature);
pre_dec->leakage_feedback(temperature);
double pppm_t[4] = {1,1,1,1};
double squencer_passes = x86?2:1;
set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
power = pre_dec->power*pppm_t;
set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
power = power + final_dec->power*pppm_t;
double sckRation = g_tp.sckt_co_eff;
power.readOp.dynamic *= sckRation;
power.writeOp.dynamic *= sckRation;
power.searchOp.dynamic *= sckRation;
double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
}
/**
 * Release everything the constructor set up: the interface result, the
 * final decoder, and the pre-decoder together with its blocks and drivers.
 */
InstructionDecoder::~InstructionDecoder() {
    local_result.cleanup();

    delete final_dec;

    // The pre-decode blocks and their drivers were allocated by the
    // constructor and are freed here, before the Predec that refers to them.
    Predec* const predecoder = pre_dec;
    delete predecoder->blk1;
    delete predecoder->blk2;
    delete predecoder->drv1;
    delete predecoder->drv2;
    delete predecoder;
}