gem5/ext/mcpat/logic.cc

/*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *            Copyright (c) 2010-2013 Advanced Micro Devices, Inc.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/

#include "common.h"
#include "logic.h"

//selection_logic
selection_logic::selection_logic(XMLNode* _xml_data, bool _is_default,
                                 int _win_entries, int issue_width_,
                                 const InputParameter *configure_interface,
                                 string _name, double _accesses,
                                 double clockRate_, enum Device_ty device_ty_,
                                 enum Core_type core_ty_)
    : McPATComponent(_xml_data), is_default(_is_default),
      win_entries(_win_entries),
      issue_width(issue_width_),
      accesses(_accesses),
      device_ty(device_ty_),
      core_ty(core_ty_) {
    clockRate = clockRate_;
    name = _name;
    l_ip = *configure_interface;
    local_result = init_interface(&l_ip, name);
}

void selection_logic::computeArea() {
    output_data.area = local_result.area;
}

void selection_logic::computeEnergy() {
    //based on cost effective superscalar processor TR pp27-31
    double Ctotal, Cor, Cpencode;
    int num_arbiter;
    double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp;

    //the 0.8um process data is used.
    //this was 10 micron for the 0.8 micron process
    WSelORn	= 12.5 * l_ip.F_sz_um;
    //this was 40 micron for the 0.8 micron process
    WSelORprequ = 50 * l_ip.F_sz_um;
    //this was 10mcron for the 0.8 micron process
    WSelPn = 12.5 * l_ip.F_sz_um;
    //this was 15 micron for the 0.8 micron process
    WSelPp = 18.75 * l_ip.F_sz_um;
    //this was 5 micron for the 0.8 micron process
    WSelEnn	= 6.25 * l_ip.F_sz_um;
    //this was 10 micron for the 0.8 micron process
    WSelEnp	= 12.5 * l_ip.F_sz_um;

    Ctotal = 0;
    num_arbiter = 1;
    while (win_entries > 4) {
        win_entries = (int)ceil((double)win_entries / 4.0);
        num_arbiter += win_entries;
    }
    //the 4-input OR logic to generate anyreq
    Cor = 4 * drain_C_(WSelORn, NCH, 1, 1, g_tp.cell_h_def) +
        drain_C_(WSelORprequ, PCH, 1, 1, g_tp.cell_h_def);
    power.readOp.gate_leakage =
        cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor) * g_tp.peri_global.Vdd;

    //The total capacity of the 4-bit priority encoder
    Cpencode = drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
        drain_C_(WSelPp, PCH, 1, 1, g_tp.cell_h_def) +
        2 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
        drain_C_(WSelPp, PCH, 2, 1, g_tp.cell_h_def) +
        3 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
        drain_C_(WSelPp, PCH, 3, 1, g_tp.cell_h_def) +
        4 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
        drain_C_(WSelPp, PCH, 4, 1, g_tp.cell_h_def) +//precompute priority logic
        2 * 4 * gate_C(WSelEnn + WSelEnp, 20.0) +
        4 * drain_C_(WSelEnn, NCH, 1, 1, g_tp.cell_h_def) +
        2 * 4 * drain_C_(WSelEnp, PCH, 1, 1, g_tp.cell_h_def) +//enable logic
        (2 * 4 + 2 * 3 + 2 * 2 + 2) *
        gate_C(WSelPn + WSelPp, 10.0);//requests signal

    Ctotal += issue_width * num_arbiter * (Cor + Cpencode);

    //2 means the abitration signal need to travel round trip
    power.readOp.dynamic =
        Ctotal * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 2;
    power.readOp.leakage = issue_width * num_arbiter *
        (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
         + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p
         + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p
         + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
         + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant sIsubnals
            ) * g_tp.peri_global.Vdd;
    power.readOp.gate_leakage = issue_width * num_arbiter *
        (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
         + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p
         + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p
         + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
         + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals
            ) * g_tp.peri_global.Vdd;
    double sckRation = g_tp.sckt_co_eff;
    power.readOp.dynamic *= sckRation;
    power.writeOp.dynamic *= sckRation;
    power.searchOp.dynamic *= sckRation;

    double long_channel_device_reduction =
        longer_channel_device_reduction(device_ty, core_ty);
    power.readOp.longer_channel_leakage =
        power.readOp.leakage * long_channel_device_reduction;

    output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
    output_data.subthreshold_leakage_power = power.readOp.leakage;
    output_data.gate_leakage_power = power.readOp.gate_leakage;
    output_data.runtime_dynamic_energy = power.readOp.dynamic * accesses;
}

dep_resource_conflict_check::dep_resource_conflict_check(
    XMLNode* _xml_data, const string _name,
    const InputParameter *configure_interface,
    const CoreParameters & dyn_p_, int compare_bits_,
    double clockRate_, bool _is_default)
    : McPATComponent(_xml_data), l_ip(*configure_interface),
      coredynp(dyn_p_), compare_bits(compare_bits_), is_default(_is_default) {

    name = _name;
    clockRate = clockRate_;
    //this was 20.0 micron for the 0.8 micron process
    Wcompn = 25 * l_ip.F_sz_um;
    //this was 20.0 micron for the 0.8 micron process
    Wevalinvp = 25 * l_ip.F_sz_um;
    //this was 80.0 mcron for the 0.8 micron process
    Wevalinvn = 100 * l_ip.F_sz_um;
    //this was 40.0  micron for the 0.8 micron process
    Wcomppreequ = 50 * l_ip.F_sz_um;
    //this was 5.4 micron for the 0.8 micron process
    WNORn =	6.75 * l_ip.F_sz_um;
    //this was 30.5 micron for the 0.8 micron process
    WNORp =	38.125 * l_ip.F_sz_um;

    // To make CACTI happy.
    l_ip.cache_sz = MIN_BUFFER_SIZE;
    local_result = init_interface(&l_ip, name);

    if (coredynp.core_ty == Inorder)
        //TODO: opcode bits + log(shared resources) + REG TAG BITS -->
        //opcode comparator
        compare_bits += 16 + 8 + 8;
    else
        compare_bits += 16 + 8 + 8;

    conflict_check_power();
    double sckRation = g_tp.sckt_co_eff;
    power.readOp.dynamic *= sckRation;
    power.writeOp.dynamic *= sckRation;
    power.searchOp.dynamic *= sckRation;

}

void dep_resource_conflict_check::conflict_check_power() {
    double Ctotal;
    int num_comparators;
    //2(N*N-N) is used for source to dest comparison, (N*N-N) is used for
    //dest to dest comparision.
    num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) -
                           coredynp.decodeW);

    Ctotal = num_comparators * compare_cap();

    power.readOp.dynamic = Ctotal * /*CLOCKRATE*/ g_tp.peri_global.Vdd *
        g_tp.peri_global.Vdd /*AF*/;
    power.readOp.leakage = num_comparators * compare_bits * 2 *
        simplified_nmos_leakage(Wcompn,  false);

    double long_channel_device_reduction =
        longer_channel_device_reduction(Core_device, coredynp.core_ty);
    power.readOp.longer_channel_leakage	=
        power.readOp.leakage * long_channel_device_reduction;
    power.readOp.gate_leakage = num_comparators * compare_bits * 2 *
        cmos_Ig_leakage(Wcompn, 0, 2, nmos);

}

/* estimate comparator power consumption (this comparator is similar
   to the tag-match structure in a CAM */
double dep_resource_conflict_check::compare_cap() {
    double c1, c2;

    //resize the big NOR gate at the DCL according to fan in.
    WNORp = WNORp * compare_bits / 2.0;
    /* bottom part of comparator */
    c2 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) +
                           drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def)) +
        drain_C_(Wevalinvp, PCH, 1, 1, g_tp.cell_h_def) +
        drain_C_(Wevalinvn, NCH, 1, 1, g_tp.cell_h_def);

    /* top part of comparator */
    c1 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) +
                           drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def) +
                           drain_C_(Wcomppreequ, NCH, 1, 1, g_tp.cell_h_def)) +
        gate_C(WNORn + WNORp, 10.0) +
        drain_C_(WNORp, NCH, 2, 1, g_tp.cell_h_def) + compare_bits *
        drain_C_(WNORn, NCH, 2, 1, g_tp.cell_h_def);
    return(c1 + c2);

}

void dep_resource_conflict_check::leakage_feedback(double temperature)
{
  l_ip.temp = (unsigned int)round(temperature/10.0)*10;
  uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy

  // This is part of conflict_check_power()
  // 2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest
  // to dest comparison.
  int num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) -
                             coredynp.decodeW);
  power.readOp.leakage = num_comparators * compare_bits * 2 *
      simplified_nmos_leakage(Wcompn,  false);

  double long_channel_device_reduction =
      longer_channel_device_reduction(Core_device, coredynp.core_ty);
  power.readOp.longer_channel_leakage = power.readOp.leakage *
      long_channel_device_reduction;
  power.readOp.gate_leakage = num_comparators * compare_bits * 2 *
      cmos_Ig_leakage(Wcompn, 0, 2, nmos);
}


DFFCell::DFFCell(
    bool _is_dram,
    double _WdecNANDn,
    double _WdecNANDp,
    double _cell_load,
    const InputParameter *configure_interface)
        : is_dram(_is_dram),
        cell_load(_cell_load),
        WdecNANDn(_WdecNANDn),
        WdecNANDp(_WdecNANDp) { //this model is based on the NAND2 based DFF.
    l_ip = *configure_interface;
    area.set_area(5 * compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp,
                                        g_tp.cell_h_def)
                  + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn,
                                      g_tp.cell_h_def));


}


double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out) {
    double Ctotal = 0;

    /* part 1: drain cap of NAND gate */
    Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram);

    /* part 2: gate cap of NAND gates */
    Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);

    return Ctotal;
}


void DFFCell::compute_DFF_cell() {
    double c1, c2, c3, c4, c5, c6;
    /* node 5 and node 6 are identical to node 1 in capacitance */
    c1 = c5 = c6 = fpfp_node_cap(2, 1);
    c2 = fpfp_node_cap(2, 3);
    c3 = fpfp_node_cap(3, 2);
    c4 = fpfp_node_cap(2, 2);

    //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2
    clock_cap = 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
    e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2 * cell_load) *
        0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;

    /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */
    e_keep_1.readOp.dynamic +=
        c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
    e_keep_0.readOp.dynamic +=
        c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
    e_clock.readOp.dynamic +=
        clock_cap * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;

    /* static power */
    e_switch.readOp.leakage +=
        (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand) *
         5//5 NAND2 and 1 NAND3 in a DFF
         + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand)) *
        g_tp.peri_global.Vdd;
    e_switch.readOp.gate_leakage +=
        (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand) *
         5//5 NAND2 and 1 NAND3 in a DFF
         + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand)) *
        g_tp.peri_global.Vdd;
}

Pipeline::Pipeline(XMLNode* _xml_data,
                   const InputParameter *configure_interface,
                   const CoreParameters & dyn_p_,
                   enum Device_ty device_ty_,
                   bool _is_core_pipeline,
                   bool _is_default)
    : McPATComponent(_xml_data), l_ip(*configure_interface),
      coredynp(dyn_p_), device_ty(device_ty_),
      is_core_pipeline(_is_core_pipeline), is_default(_is_default),
      num_piperegs(0.0) {
    name = "Pipeline?";

    local_result = init_interface(&l_ip, name);
    if (!coredynp.Embedded) {
        process_ind = true;
    } else {
        process_ind = false;
    }
    //this was  20 micron for the 0.8 micron process
    WNANDn = (process_ind) ? 25 * l_ip.F_sz_um : g_tp.min_w_nmos_ ;
    //this was  30 micron for the 0.8 micron process
    WNANDp = (process_ind) ? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_ *
        pmos_to_nmos_sz_ratio();
    load_per_pipeline_stage = 2 * gate_C(WNANDn + WNANDp, 0, false);
    compute();

}

void Pipeline::compute() {
    compute_stage_vector();
    DFFCell pipe_reg(false, WNANDn, WNANDp, load_per_pipeline_stage, &l_ip);
    pipe_reg.compute_DFF_cell();

    double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic;
    //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider
    //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power.
    double pipe_reg_power = num_piperegs *
        (pipe_reg.e_switch.readOp.dynamic + pipe_reg.e_keep_0.readOp.dynamic +
         pipe_reg.e_keep_1.readOp.dynamic) / 3 + clock_power_pipereg;
    double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage;
    double pipe_reg_gate_leakage = num_piperegs *
        pipe_reg.e_switch.readOp.gate_leakage;
    power.readOp.dynamic	+= pipe_reg_power;
    power.readOp.leakage	+= pipe_reg_leakage;
    power.readOp.gate_leakage	+= pipe_reg_gate_leakage;
    area.set_area(num_piperegs * pipe_reg.area.get_area());

    double long_channel_device_reduction =
        longer_channel_device_reduction(device_ty, coredynp.core_ty);
    power.readOp.longer_channel_leakage	= power.readOp.leakage *
        long_channel_device_reduction;


    double sckRation = g_tp.sckt_co_eff;
    power.readOp.dynamic *= sckRation;
    power.writeOp.dynamic *= sckRation;
    power.searchOp.dynamic *= sckRation;
    double macro_layout_overhead = g_tp.macro_layout_overhead;
        if (!coredynp.Embedded)
                area.set_area(area.get_area() * macro_layout_overhead);

    output_data.area = area.get_area() / 1e6;
    output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
    output_data.subthreshold_leakage_power = power.readOp.leakage;
    output_data.gate_leakage_power = power.readOp.gate_leakage;
    output_data.runtime_dynamic_energy = power.readOp.dynamic * total_cycles;
}

void Pipeline::compute_stage_vector() {
    double num_stages, tot_stage_vector, per_stage_vector;
    int opcode_length = coredynp.x86 ?
        coredynp.micro_opcode_length : coredynp.opcode_width;

    if (!is_core_pipeline) {
        //The number of pipeline stages are calculated based on the achievable
        //throughput and required throughput
        num_piperegs = l_ip.pipeline_stages * l_ip.per_stage_vector;
    } else {
        if (coredynp.core_ty == Inorder) {
            /* assume 6 pipe stages and try to estimate bits per pipe stage */
            /* pipe stage 0/IF */
            num_piperegs += coredynp.pc_width * 2 * coredynp.num_hthreads;
            /* pipe stage IF/ID */
            num_piperegs += coredynp.fetchW *
                (coredynp.instruction_length + coredynp.pc_width) *
                coredynp.num_hthreads;
            /* pipe stage IF/ThreadSEL */
            if (coredynp.multithreaded) {
                num_piperegs += coredynp.num_hthreads *
                    coredynp.perThreadState; //8 bit thread states
            }
            /* pipe stage ID/EXE */
            num_piperegs += coredynp.decodeW *
                (coredynp.instruction_length + coredynp.pc_width +
                 pow(2.0, opcode_length) + 2 * coredynp.int_data_width) *
                coredynp.num_hthreads;
            /* pipe stage EXE/MEM */
            num_piperegs += coredynp.issueW *
                (3 * coredynp.arch_ireg_width + pow(2.0, opcode_length) + 8 *
                 2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/);
            /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/
            num_piperegs += coredynp.issueW *
                (2 * coredynp.int_data_width + pow(2.0, opcode_length) + 8 *
                 2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/);
            num_stages = 6;
        } else {
            /* assume 12 stage pipe stages and try to estimate bits per pipe stage */
            /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */

            /* pipe stage 0/1F*/
            num_piperegs +=
                coredynp.pc_width * 2 * coredynp.num_hthreads ;//PC and Next PC
            /* pipe stage IF/ID */
            num_piperegs += coredynp.fetchW *
                (coredynp.instruction_length + coredynp.pc_width) *
                coredynp.num_hthreads;//PC is used to feed branch predictor in ID
            /* pipe stage 1D/Renaming*/
            num_piperegs += coredynp.decodeW *
                (coredynp.instruction_length + coredynp.pc_width) *
                coredynp.num_hthreads;//PC is for branch exe in later stage.
            /* pipe stage Renaming/wire_drive */
            num_piperegs += coredynp.decodeW *
                (coredynp.instruction_length + coredynp.pc_width);
            /* pipe stage Renaming/IssueQ */
            //3*coredynp.phy_ireg_width means 2 sources and 1 dest
            num_piperegs += coredynp.issueW *
                (coredynp.instruction_length  + coredynp.pc_width + 3 *
                 coredynp.phy_ireg_width) * coredynp.num_hthreads;
            /* pipe stage IssueQ/Dispatch */
            num_piperegs += coredynp.issueW *
                (coredynp.instruction_length + 3 * coredynp.phy_ireg_width);
            /* pipe stage Dispatch/EXE */

            num_piperegs += coredynp.issueW *
                (3 * coredynp.phy_ireg_width + coredynp.pc_width +
                 pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/);
            /* 2^opcode_length means the total decoded signal for the opcode*/
            num_piperegs += coredynp.issueW *
                (2 * coredynp.int_data_width + pow(2.0, opcode_length)
                 /*+2*powers (2,reg_length)*/);
            /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/
            num_piperegs += coredynp.issueW *
                (2 * coredynp.int_data_width + pow(2.0, opcode_length)
                 /*+2*powers (2,reg_length)*/);
            /* pipe stage EXE/MEM, data need to be read/write, address*/
            //memory Opcode still need to be passed
            num_piperegs += coredynp.issueW *
                (coredynp.int_data_width + coredynp.v_address_width +
                 pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/);
            /* pipe stage MEM/WB; result data, writeback regs */
            num_piperegs += coredynp.issueW *
                (coredynp.int_data_width + coredynp.phy_ireg_width
                 /* powers (2,opcode_length) +
                    (2,opcode_length)+2*powers (2,reg_length)*/);
            /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/
            num_piperegs += coredynp.commitW *
                (coredynp.int_data_width + coredynp.v_address_width +
                 coredynp.phy_ireg_width
                 /*+ powers (2,opcode_length)*2*powers (2,reg_length)*/) *
                coredynp.num_hthreads;
            num_stages = 12;

        }

        /* assume 50% extra in control registers and interrupt registers (rule of thumb) */
        num_piperegs = num_piperegs * 1.5;
        tot_stage_vector = num_piperegs;
        per_stage_vector = tot_stage_vector / num_stages;

        if (coredynp.core_ty == Inorder) {
            if (coredynp.pipeline_stages > 6)
                num_piperegs = per_stage_vector * coredynp.pipeline_stages;
        } else { //OOO
            if (coredynp.pipeline_stages > 12)
                num_piperegs = per_stage_vector * coredynp.pipeline_stages;
        }
    }

}

FunctionalUnit::FunctionalUnit(XMLNode* _xml_data,
                               InputParameter* interface_ip_,
                               const CoreParameters & _core_params,
                               const CoreStatistics & _core_stats,
                               enum FU_type fu_type_)
    : McPATComponent(_xml_data),
      interface_ip(*interface_ip_), core_params(_core_params),
      core_stats(_core_stats), fu_type(fu_type_) {
    double area_t;
    double leakage;
    double gate_leakage;
    double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
    clockRate = core_params.clockRate;

    uca_org_t result2;
    // Temp name for the following function call
    name = "Functional Unit";

    result2 = init_interface(&interface_ip, name);

        if (core_params.Embedded) {
            if (fu_type == FPU) {
                num_fu=core_params.num_fpus;
                        //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
                        area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
                        //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60%
                        if (g_ip->F_sz_nm>90)
                                area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
                        gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
                        //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles.
//			base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
//			base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
                        base_energy = 0;
                        per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per Hz energy(nJ)
                        //FPU power from Sandia's processor sizing tech report
                        FU_height=(18667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
            } else if (fu_type == ALU) {
                num_fu=core_params.num_alus;
                        area_t = 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
                        gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
//			base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
//			base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
                        base_energy = 0;
                        per_access_energy = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
                        FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU

            } else if (fu_type == MUL) {
                num_fu=core_params.num_muls;
                        area_t = 280*260*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
                        gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
//			base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
//			base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
                        base_energy = 0;
                        per_access_energy = 1.15*2/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
                        FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
            } else {
                        cout<<"Unknown Functional Unit Type"<<endl;
                        exit(0);
                }
                per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy
        } else {
            if (fu_type == FPU) {
                name = "Floating Point Unit(s)";
                num_fu = core_params.num_fpus;
                area_t = 8.47 * 1e6 * (g_ip->F_sz_nm * g_ip->F_sz_nm / 90.0 /
                                       90.0);//this is um^2
                if (g_ip->F_sz_nm > 90)
                    area_t = 8.47 * 1e6 *
                        g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
            leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
            gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
            //W The base energy of ALU average numbers from Intel 4G and
            //773Mhz (Wattch)
            base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 3;
            base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
                            1.2);
            per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ)
            FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
        } else if (fu_type == ALU) {
            name = "Integer ALU(s)";
            num_fu = core_params.num_alus;
            //this is um^2 ALU + MUl
            area_t = 280 * 260 * 2 * g_tp.scaling_factor.logic_scaling_co_eff;
            leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
            gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
            //W The base energy of ALU average numbers from Intel 4G and 773Mhz
            //(Wattch)
            base_energy = core_params.core_ty == Inorder ? 0 : 89e-3;
            base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
                            1.2);
            per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
            FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
        } else if (fu_type == MUL) {
            name = "Multiply/Divide Unit(s)";
            num_fu = core_params.num_muls;
            //this is um^2 ALU + MUl
            area_t = 280 * 260 * 2 * 3 *
                g_tp.scaling_factor.logic_scaling_co_eff;
            leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
            gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
            //W The base energy of ALU average numbers from Intel 4G and 773Mhz
            //(Wattch)
            base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 2;
            base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
                            1.2);
            per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
            FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
        } else {
            cout << "Unknown Functional Unit Type" << endl;
            exit(0);
        }
    }

    area.set_area(area_t*num_fu);
    power.readOp.leakage = leakage * num_fu;
    power.readOp.gate_leakage = gate_leakage * num_fu;

    double long_channel_device_reduction =
        longer_channel_device_reduction(Core_device, core_params.core_ty);
    power.readOp.longer_channel_leakage	=
        power.readOp.leakage * long_channel_device_reduction;
    double macro_layout_overhead = g_tp.macro_layout_overhead;
    area.set_area(area.get_area()*macro_layout_overhead);
}

void FunctionalUnit::computeEnergy() {
    double pppm_t[4]    = {1, 1, 1, 1};
    double FU_duty_cycle;
    double sckRation = g_tp.sckt_co_eff;

    // TDP power calculation
    //2 means two source operands needs to be passed for each int instruction.
    set_pppm(pppm_t, 2, 2, 2, 2);
    tdp_stats.readAc.access = num_fu;
    if (fu_type == FPU) {
        FU_duty_cycle = core_stats.FPU_duty_cycle;
    } else if (fu_type == ALU) {
        FU_duty_cycle = core_stats.ALU_duty_cycle;
    } else if (fu_type == MUL) {
        FU_duty_cycle = core_stats.MUL_duty_cycle;
    }

    power.readOp.dynamic =
        per_access_energy * tdp_stats.readAc.access + base_energy / clockRate;
    power.readOp.dynamic *= sckRation * FU_duty_cycle;

    // Runtime power calculation
    if (fu_type == FPU) {
        rtp_stats.readAc.access = core_stats.fpu_accesses;
    } else if (fu_type == ALU) {
        rtp_stats.readAc.access = core_stats.ialu_accesses;
    } else if (fu_type == MUL) {
        rtp_stats.readAc.access = core_stats.mul_accesses;
    }

    rt_power.readOp.dynamic = per_access_energy * rtp_stats.readAc.access +
        base_energy * execution_time;
    rt_power.readOp.dynamic *= sckRation;

    output_data.area = area.get_area() / 1e6;
    output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
    output_data.subthreshold_leakage_power =
        (longer_channel_device) ? power.readOp.longer_channel_leakage :
        power.readOp.leakage;
    output_data.gate_leakage_power = power.readOp.gate_leakage;
    output_data.runtime_dynamic_energy = rt_power.readOp.dynamic;
}

void FunctionalUnit::leakage_feedback(double temperature)
{
  // Update the temperature and initialize the global interfaces.
  interface_ip.temp = (unsigned int)round(temperature/10.0)*10;

  // init_result is dummy
  uca_org_t init_result = init_interface(&interface_ip, name);

  // This is part of FunctionalUnit()
  double area_t, leakage, gate_leakage;
  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();

  if (fu_type == FPU)
  {
        area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
        if (g_ip->F_sz_nm>90)
                area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
        gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
  }
  else if (fu_type == ALU)
  {
    area_t = 280*260*2*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
    leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
    gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
  }
  else if (fu_type == MUL)
  {
    area_t = 280*260*2*3*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
    leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
    gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
  }
  else
  {
    cout<<"Unknown Functional Unit Type"<<endl;
    exit(1);
  }

  power.readOp.leakage = leakage*num_fu;
  power.readOp.gate_leakage = gate_leakage*num_fu;
  power.readOp.longer_channel_leakage =
      longer_channel_device_reduction(Core_device, core_params.core_ty);
}

UndiffCore::UndiffCore(XMLNode* _xml_data, InputParameter* interface_ip_,
                       const CoreParameters & dyn_p_,
                       bool exist_)
        : McPATComponent(_xml_data),
        interface_ip(*interface_ip_), coredynp(dyn_p_),
        core_ty(coredynp.core_ty), embedded(coredynp.Embedded),
        pipeline_stage(coredynp.pipeline_stages),
        num_hthreads(coredynp.num_hthreads), issue_width(coredynp.issueW),
        exist(exist_) {
    if (!exist) return;

    name = "Undifferentiated Core";
    clockRate = coredynp.clockRate;

    double undifferentiated_core = 0;
    double core_tx_density = 0;
    double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
        double undifferentiated_core_coe;
    uca_org_t result2;
    result2 = init_interface(&interface_ip, name);

    //Compute undifferentiated core area at 90nm.
    if (embedded == false) {
        //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements
        if (core_ty == OOO) {
            undifferentiated_core = (3.57 * log(pipeline_stage) - 1.2643) > 0 ?
                (3.57 * log(pipeline_stage) - 1.2643) : 0;
        } else if (core_ty == Inorder) {
            undifferentiated_core = (-2.19 * log(pipeline_stage) + 6.55) > 0 ?
                (-2.19 * log(pipeline_stage) + 6.55) : 0;
        } else {
            cout << "invalid core type" << endl;
            exit(0);
        }
        undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0716);
    } else {
        //Based on the results in paper "parametrized processor models" Sandia Labs
                if (opt_for_clk)
                        undifferentiated_core_coe = 0.05;
                else
                        undifferentiated_core_coe = 0;
                undifferentiated_core = (0.4109 * pipeline_stage - 0.776) *
                    undifferentiated_core_coe;
                undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0426);
    }

    undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff *
        1e6;//change from mm^2 to um^2
    core_tx_density                 = g_tp.scaling_factor.core_tx_density;
    power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
    power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;

    double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
    power.readOp.longer_channel_leakage	=
        power.readOp.leakage * long_channel_device_reduction;
    area.set_area(undifferentiated_core);

    scktRatio = g_tp.sckt_co_eff;
    power.readOp.dynamic *= scktRatio;
    power.writeOp.dynamic *= scktRatio;
    power.searchOp.dynamic *= scktRatio;
    macro_PR_overhead = g_tp.macro_layout_overhead;
    area.set_area(area.get_area()*macro_PR_overhead);

    output_data.area = area.get_area() / 1e6;
    output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
    output_data.subthreshold_leakage_power =
        longer_channel_device ? power.readOp.longer_channel_leakage :
        power.readOp.leakage;
    output_data.gate_leakage_power = power.readOp.gate_leakage;
}

InstructionDecoder::InstructionDecoder(XMLNode* _xml_data, const string _name,
                                       bool _is_default,
                                       const InputParameter *configure_interface,
                                       int opcode_length_, int num_decoders_,
                                       bool x86_,
                                       double clockRate_,
                                       enum Device_ty device_ty_,
                                       enum Core_type core_ty_)
    : McPATComponent(_xml_data), is_default(_is_default),
      opcode_length(opcode_length_), num_decoders(num_decoders_), x86(x86_),
      device_ty(device_ty_), core_ty(core_ty_) {
    /*
     * Instruction decoder is different from n to 2^n decoders
     * that are commonly used in row decoders in memory arrays.
     * The RISC instruction decoder is typically a very simple device.
     * We can decode an instruction by simply
     * separating the machine word into small parts using wire slices
     * The RISC instruction decoder can be approximate by the n to 2^n decoders,
     * although this approximation usually underestimate power since each decoded
     * instruction normally has more than 1 active signal.
     *
     * However, decoding a CISC instruction word is much more difficult
     * than the RISC case. A CISC decoder is typically set up as a state machine.
     * The machine reads the opcode field to determine
     * what type of instruction it is,
     * and where the other data values are.
     * The instruction word is read in piece by piece,
     * and decisions are made at each stage as to
     * how the remainder of the instruction word will be read.
     * (sequencer and ROM are usually needed)
     * An x86 decoder can be even more complex since
     * it involve  both decoding instructions into u-ops and
     * merge u-ops when doing micro-ops fusion.
     */
    name = _name;
    clockRate = clockRate_;
    bool is_dram = false;
    double pmos_to_nmos_sizing_r;
    double load_nmos_width, load_pmos_width;
    double C_driver_load, R_wire_load;
    Area cell;

    l_ip = *configure_interface;
    local_result = init_interface(&l_ip, name);
    cell.h = g_tp.cell_h_def;
    cell.w = g_tp.cell_h_def;

    num_decoder_segments = (int)ceil(opcode_length / 18.0);
    if (opcode_length > 18)	opcode_length = 18;
    num_decoded_signals = (int)pow(2.0, opcode_length);
    pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
    load_nmos_width = g_tp.max_w_nmos_ / 2;
    load_pmos_width = g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r;
    C_driver_load = 1024 * gate_C(load_nmos_width + load_pmos_width, 0, is_dram);
    R_wire_load   = 3000 * l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um;

    final_dec = new Decoder(
        num_decoded_signals,
        false,
        C_driver_load,
        R_wire_load,
        false/*is_fa*/,
        false/*is_dram*/,
        false/*wl_tr*/, //to use peri device
        cell);

    PredecBlk * predec_blk1 = new PredecBlk(
        num_decoded_signals,
        final_dec,
        0,//Assuming predec and dec are back to back
        0,
        1,//Each Predec only drives one final dec
        false/*is_dram*/,
        true);
    PredecBlk * predec_blk2 = new PredecBlk(
        num_decoded_signals,
        final_dec,
        0,//Assuming predec and dec are back to back
        0,
        1,//Each Predec only drives one final dec
        false/*is_dram*/,
        false);

    PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false);
    PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false);

    pre_dec            = new Predec(predec_blk_drv1, predec_blk_drv2);

    double area_decoder = final_dec->area.get_area() * num_decoded_signals *
        num_decoder_segments * num_decoders;
    //double w_decoder    = area_decoder / area.get_h();
    double area_pre_dec = (predec_blk_drv1->area.get_area() +
                           predec_blk_drv2->area.get_area() +
                           predec_blk1->area.get_area() +
                           predec_blk2->area.get_area()) *
                          num_decoder_segments * num_decoders;
    area.set_area(area.get_area() + area_decoder + area_pre_dec);
    double macro_layout_overhead   = g_tp.macro_layout_overhead;
    double chip_PR_overhead        = g_tp.chip_layout_overhead;
    area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead);

    inst_decoder_delay_power();

    double sckRation = g_tp.sckt_co_eff;
    power.readOp.dynamic *= sckRation;
    power.writeOp.dynamic *= sckRation;
    power.searchOp.dynamic *= sckRation;

    double long_channel_device_reduction =
        longer_channel_device_reduction(device_ty, core_ty);
    power.readOp.longer_channel_leakage	= power.readOp.leakage *
        long_channel_device_reduction;

    output_data.area = area.get_area() / 1e6;
    output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
    output_data.subthreshold_leakage_power = power.readOp.leakage;
    output_data.gate_leakage_power = power.readOp.gate_leakage;
}

void InstructionDecoder::inst_decoder_delay_power() {

    double dec_outrisetime;
    double inrisetime = 0, outrisetime;
    double pppm_t[4]    = {1, 1, 1, 1};
    double squencer_passes = x86 ? 2 : 1;

    outrisetime = pre_dec->compute_delays(inrisetime);
    dec_outrisetime = final_dec->compute_delays(outrisetime);
    set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
    power = power + pre_dec->power * pppm_t;
    set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,
             num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
    power = power + final_dec->power * pppm_t;
}

void InstructionDecoder::leakage_feedback(double temperature) {
  l_ip.temp = (unsigned int)round(temperature/10.0)*10;
  uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy

  final_dec->leakage_feedback(temperature);
  pre_dec->leakage_feedback(temperature);

  double pppm_t[4]    = {1,1,1,1};
  double squencer_passes = x86?2:1;

  set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
  power = pre_dec->power*pppm_t;

  set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
  power = power + final_dec->power*pppm_t;

  double sckRation = g_tp.sckt_co_eff;

  power.readOp.dynamic *= sckRation;
  power.writeOp.dynamic *= sckRation;
  power.searchOp.dynamic *= sckRation;

  double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
  power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
}

InstructionDecoder::~InstructionDecoder() {
    local_result.cleanup();

    delete final_dec;

    delete pre_dec->blk1;
    delete pre_dec->blk2;
    delete pre_dec->drv1;
    delete pre_dec->drv2;
    delete pre_dec;
}