Power measurement - ARM Cortex-A53 - microarchitecture parameters

Hello! I’m doing simulation of power consumption of Snapdragon 410E processor (4 core ARM Cortex-A53). The simulator I use is McPAT http://www.hpl.hp.com/research/mcpat/ which calculates total power based upon microarchitecture parameters and cycle-by-cycle performance simulator from XML config file. But I can’t find information about some microarchitecture parameters bellow:

System:

number_of_NoCs (is it Network on Chip?)
homogeneous_ccs
interconnect_projection_type
opt_clockrate
longer_channel_device

System.core0:

opt_local
opcode_width
micro_opcode_width
fp_issue_width
prediction_width
instruction_buffer_size
IFU_duty_cycle
BR_duty_cycle
LSU_duty_cycle
MemManU_I_duty_cycle
ALU_duty_cycle
ALU_cdb_duty_cycle


system.L1Directory0:
Dir_config
buffer_sizes
clockrate
ports
device_type

I’m attaching XML config file with some parameters based on Cortex A9 config file. Or maybe there is already configuration file under A53.

Best regards and thanks for help.

Pawel K.

ARM_A53_SNAPDRAGON410_SOC.xml
<?xml version="1.0" ?>
<component id="root" name="root">
	<component id="system" name="system">
		<!--McPAT will skip the components if number is set to 0 -->
		<!--Duty cycles in this file are set according to "ARM MPcore
			ARchitecture performance Enhancement" in MPF Japan 2008 -->
		<param name="number_of_cores" value="4"/> <!-- cpu-info  -->
		<param name="number_of_L1Directories" value="4"/> <!-- cache-info  -->
		<param name="number_of_L2Directories" value="1"/> <!-- cache-info -->
		<param name="number_of_L2s" value="1"/> <!-- This number means how many L2 clusters in each cluster there can 
													be multiple banks/ports, cache-info -->
		<param name="Private_L2" value="0"/> <!--1 Private, 0 shared/coherent -->
		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
		<param name="number_of_NoCs" value="0"/>  <!-- NoC - Network on Chip -->
		<param name="homogeneous_cores" value="1"/> <!-- 4 same cores -->
		<param name="homogeneous_L2s" value="0"/>
		<param name="homogeneous_L1Directories" value="1"/>
		<param name="homogeneous_L2Directories" value="0"/>
		<param name="homogeneous_L3s" value="0"/>
		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
		<param name="homogeneous_NoCs" value="0"/>
		<param name="core_tech_node" value="28"/><!-- nm -->
		<param name="target_core_clockrate" value="REPLACE{config.system.clk_domain.clock}"/><!--MHz -->
		<param name="temperature" value="340"/> <!-- Kelvin -->
		<param name="number_cache_levels" value="2"/>
		<param name="interconnect_projection_type" value="1"/><!--0: aggressive wire technology; 1: conservative wire technology -->
		<param name="device_type" value="2"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
		<param name="Embedded" value="1"/><!-- Embedded processor like ARM or general purpose processors?  -->
		<param name="opt_clockrate" value="1"/>
		<param name="machine_bits" value="64"/>
		<param name="virtual_address_width" value="64"/>
		<param name="physical_address_width" value="64"/>
		<param name="virtual_memory_page_size" value="4096"/>
		<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller 
			default value is machine_bits, if not set --> 
		<stat name="total_cycles" value="REPLACE{stats.system.cpu.numCycles}"/>
		<stat name="idle_cycles" value="REPLACE{stats.system.cpu.idleCycles}"/>
		<stat name="busy_cycles"  value="REPLACE{stats.system.cpu.numCycles - stats.system.cpu.idleCycles}"/>
			<!--This page size(B) is complete different from the page size in Main memo section. this page size is the size of 
			virtual memory from OS/Archi perspective; the page size in Main memo section is the actual physical line in a DRAM bank  -->
		<!-- *********************** cores ******************* -->
		<component id="system.core0" name="core0">
			<!-- Core property -->
			<param name="clock_rate" value="REPLACE{config.system.clk_domain.clock}"/>
			<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
			<param name="opt_local" value="0"/>
			<param name="instruction_length" value="32"/>
			<param name="opcode_width" value="7"/>
			<param name="x86" value="0"/>
			<param name="micro_opcode_width" value="8"/>
			<param name="machine_type" value="0"/>
			<!-- inorder/OoO; 1 inorder; 0 OOO-->
			<param name="number_hardware_threads" value="REPLACE{config.system.cpu.numThreads}"/>
			<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
			it only may be more than one in SMT processors. BTB ports always equals to fetch ports since 
			branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.--> 
			<param name="fetch_width" value="REPLACE{config.system.cpu.fetchWidth}"/>
			<!-- fetch_width determines the size of cachelines of L1 cache block -->
			<param name="number_instruction_fetch_ports" value="1"/>
			<param name="decode_width" value="REPLACE{config.system.cpu.decodeWidth}"/>
			<!-- decode_width determines the number of ports of the 
			renaming table (both RAM and CAM) scheme -->
			<param name="issue_width" value="REPLACE{config.system.cpu.issueWidth}"/>
			<param name="peak_issue_width" value="REPLACE{config.system.cpu.issueWidth}"/>
			<!-- issue_width determines the number of ports of Issue window and other logic 
			as in the complexity effective processors paper; issue_width==dispatch_width -->
			<param name="commit_width" value="REPLACE{config.system.cpu.commitWidth}"/>
			<!-- commit_width determines the number of ports of register files -->
			<param name="fp_issue_width" value="1"/>
			<param name="prediction_width" value="1"/> 
			<!-- number of branch instructions can be predicted simultaneously-->
			<!-- Current version of McPAT does not distinguish int and floating point pipelines 
			Theses parameters are reserved for future use.--> 
			<param name="pipelines_per_core" value="1,1"/>
			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
			<param name="pipeline_depth" value="8,8"/>
			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
			<!-- issue and exe unit-->
			<param name="ALU_per_core" value="3"/>
			<!-- contains an adder, a shifter, and a logical unit -->
			<param name="MUL_per_core" value="1"/>
			<!-- For MUL and Div -->
			<param name="FPU_per_core" value="1"/>		
			<!-- buffer between IF and ID stage -->
			<param name="instruction_buffer_size" value="32"/>
			<!-- buffer between ID and sche/exe stage -->
			<param name="decoded_stream_buffer_size" value="16"/>
			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
			<param name="instruction_window_size" value="REPLACE{config.system.cpu.numIQEntries}"/>
			<param name="fp_instruction_window_size" value="REPLACE{config.system.cpu.numIQEntries}"/>
			<!-- Numbers need to be confirmed -->
			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
			<param name="ROB_size" value="REPLACE{config.system.cpu.numROBEntries}"/>
			<!-- each in-flight instruction has an entry in ROB -->
			<!-- registers -->
			<param name="archi_Regs_IRF_size" value="32"/>			
			<param name="archi_Regs_FRF_size" value="32"/>
			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
			renaming logic is for both integer and floating point insts.  -->
			<param name="phy_Regs_IRF_size" value="REPLACE{config.system.cpu.numPhysIntRegs}"/>
			<param name="phy_Regs_FRF_size" value="REPLACE{config.system.cpu.numPhysFloatRegs}"/>
			<!-- rename logic -->
			<param name="rename_scheme" value="0"/>
			<!-- can be RAM based(0) or CAM based(1) rename scheme 
			RAM-based scheme will have free list, status table;
			CAM-based scheme have the valid bit in the data field of the CAM -->
			<param name="checkpoint_depth" value="1"/>
			<!-- RAM and CAM RAT contains checkpoints, checkpoint_depth=# of in_flight speculations;
			RAM-based RAT should not have more than 4 GCs (e.g., MIPS R10000). 
			McPAT assumes the exsistance of RRAT when the RAM-RAT having no GCs (e.g., Netburst)
			CAM-based RAT should have at least 1 GC and can have more than 8 GCs. -->
			<param name="register_windows_size" value="0"/>
			<!-- how many windows in the windowed register file, sun processors;
			no register windowing is used when this number is 0 -->
			<!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha),
			They will always try to execute out-of-order though. -->
			<param name="LSU_order" value="inorder"/>
			<param name="store_buffer_size" value="REPLACE{config.system.cpu.SQEntries}"/>
			<!-- By default, in-order cores do not have load buffers -->
			<param name="load_buffer_size" value="REPLACE{config.system.cpu.LQEntries}"/>	
			<!-- number of ports refer to sustain-able concurrent memory accesses --> 
			<param name="memory_ports" value="1"/>	
			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
			as well as the ports of Dcache which is connected to LSU -->	
			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
			<param name="RAS_size" value="REPLACE{config.system.cpu.branchPred.RASSize}"/>						
			<!-- general stats, defines simulation periods;require total, idle, and busy cycles for sanity check  -->
			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
			<stat name="total_instructions" value="REPLACE{stats.system.cpu.iq.iqInstsIssued}"/>
			<stat name="int_instructions" value="REPLACE{stats.system.cpu.iq.FU_type_0::No_OpClass + stats.system.cpu.iq.FU_type_0::IntAlu + stats.system.cpu.iq.FU_type_0::IntMult + stats.system.cpu.iq.FU_type_0::IntDiv + stats.system.cpu.iq.FU_type_0::IprAccess}"/>
			<stat name="fp_instructions" value="REPLACE{stats.system.cpu.iq.FU_type_0::FloatAdd + stats.system.cpu.iq.FU_type_0::FloatCmp + stats.system.cpu.iq.FU_type_0::FloatCvt + stats.system.cpu.iq.FU_type_0::FloatMult + stats.system.cpu.iq.FU_type_0::FloatDiv + stats.system.cpu.iq.FU_type_0::FloatSqrt}"/>
			<stat name="branch_instructions" value="REPLACE{stats.system.cpu.branchPred.condPredicted}"/>
			<stat name="branch_mispredictions" value="REPLACE{stats.system.cpu.branchPred.condIncorrect}"/>
			<stat name="load_instructions" value="REPLACE{stats.system.cpu.iq.FU_type_0::MemRead + stats.system.cpu.iq.FU_type_0::InstPrefetch}"/>
			<stat name="store_instructions" value="REPLACE{stats.system.cpu.iq.FU_type_0::MemWrite}"/>
			<stat name="committed_instructions" value="REPLACE{stats.system.cpu.commit.committedOps}"/>
			<stat name="committed_int_instructions" value="REPLACE{stats.system.cpu.commit.int_insts}"/>
			<stat name="committed_fp_instructions" value="REPLACE{stats.system.cpu.commit.fp_insts}"/>
			<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
			<!-- the following cycle stats are used for heterogeneous cores only, 
				please ignore them if homogeneous cores -->
			<stat name="total_cycles" value="REPLACE{stats.system.cpu.numCycles}"/>
		    <stat name="idle_cycles" value="REPLACE{stats.system.cpu.idleCycles}"/>
		    <stat name="busy_cycles"  value="REPLACE{stats.system.cpu.numCycles - stats.system.cpu.idleCycles}"/>
			<!-- instruction buffer stats -->
			<!-- ROB stats, both RS and Phy based OoOs have ROB
			performance simulator should capture the difference on accesses,
			otherwise, McPAT has to guess based on number of committed instructions. -->
			<stat name="ROB_reads" value="REPLACE{stats.system.cpu.rob.rob_reads}"/>
			<stat name="ROB_writes" value="REPLACE{stats.system.cpu.rob.rob_writes}"/>
			<!-- RAT accesses -->
			<stat name="rename_reads" value="REPLACE{stats.system.cpu.rename.int_rename_lookups}"/> <!--lookup in renaming logic -->
			<stat name="rename_writes" value="REPLACE{int(stats.system.cpu.rename.RenamedOperands * stats.system.cpu.rename.int_rename_lookups / stats.system.cpu.rename.RenameLookups)}"/><!--update dest regs. renaming logic -->
			<stat name="fp_rename_reads" value="REPLACE{stats.system.cpu.rename.fp_rename_lookups}"/>
			<stat name="fp_rename_writes" value="REPLACE{int(stats.system.cpu.rename.RenamedOperands * stats.system.cpu.rename.fp_rename_lookups / stats.system.cpu.rename.RenameLookups)}"/>
			<!-- decode and rename stage use this, should be total ic - nop -->
			<!-- Inst window stats -->
			<stat name="inst_window_reads" value="REPLACE{stats.system.cpu.iq.int_inst_queue_reads}"/>
			<stat name="inst_window_writes" value="REPLACE{stats.system.cpu.iq.int_inst_queue_writes}"/>
			<stat name="inst_window_wakeup_accesses" value="REPLACE{stats.system.cpu.iq.int_inst_queue_wakeup_accesses}"/>
			<stat name="fp_inst_window_reads" value="REPLACE{stats.system.cpu.iq.fp_inst_queue_reads}"/>
			<stat name="fp_inst_window_writes" value="REPLACE{stats.system.cpu.iq.fp_inst_queue_writes}"/>
			<stat name="fp_inst_window_wakeup_accesses" value="REPLACE{stats.system.cpu.iq.fp_inst_queue_wakeup_accesses}"/>
			<!--  RF accesses -->
			<stat name="int_regfile_reads" value="REPLACE{stats.system.cpu.int_regfile_reads}"/>
			<stat name="float_regfile_reads" value="REPLACE{stats.system.cpu.fp_regfile_reads}"/>
			<stat name="int_regfile_writes" value="REPLACE{stats.system.cpu.int_regfile_writes}"/>
			<stat name="float_regfile_writes" value="REPLACE{stats.system.cpu.fp_regfile_writes}"/>
			<!-- accesses to the working reg -->
			<stat name="function_calls" value="REPLACE{stats.system.cpu.commit.function_calls}"/>
			<stat name="context_switches" value="REPLACE{stats.system.cpu.workload.num_syscalls}"/>
			<!-- Number of Windows switches (number of function calls and returns)-->
			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
			<stat name="ialu_accesses" value="REPLACE{stats.system.cpu.iq.int_alu_accesses}"/>			
			<stat name="fpu_accesses" value="REPLACE{stats.system.cpu.iq.fp_alu_accesses}"/>
			<stat name="mul_accesses" value="0"/>
			<stat name="cdb_alu_accesses" value="REPLACE{stats.system.cpu.iq.int_alu_accesses}"/>
			<stat name="cdb_mul_accesses" value="0"/>
			<stat name="cdb_fpu_accesses" value="REPLACE{stats.system.cpu.iq.fp_alu_accesses}"/>
			<!-- multiple cycle accesses should be counted multiple times, 
			otherwise, McPAT can use internal counter for different floating point instructions 
			to get final accesses. But that needs detailed info for floating point inst mix -->
			<!--  currently the performance simulator should 
			make sure all the numbers are final numbers, 
			including the explicit read/write accesses, 
			and the implicit accesses such as replacements and etc.
			Future versions of McPAT may be able to reason the implicit access
			based on param and stats of last level cache
			The same rule applies to all cache access stats too!  -->
			<!-- following is AF for max power computation. 
				Do not change them, unless you understand them-->
			<stat name="IFU_duty_cycle" value="0.9"/>
			<stat name="BR_duty_cycle" value="0.72"/><!--branch-->			
			<stat name="LSU_duty_cycle" value="0.71"/>
			<stat name="MemManU_I_duty_cycle" value="0.9"/>
			<stat name="MemManU_D_duty_cycle" value="0.71"/>
			<stat name="ALU_duty_cycle" value="0.76"/>
			<!-- (.78*2+.71)/3 -->
			<stat name="MUL_duty_cycle" value="0.82"/>
			<stat name="FPU_duty_cycle" value="0.0"/>
			<stat name="ALU_cdb_duty_cycle" value="0.76"/>
			<stat name="MUL_cdb_duty_cycle" value="0.82"/>
			<stat name="FPU_cdb_duty_cycle" value="0.0"/>
			<param name="number_of_BPT" value="2"/>
			<component id="system.core0.predictor" name="PBT">
				<!-- branch predictor; tournament predictor see Alpha implementation -->
				<param name="local_predictor_size" value="10,3"/>
				<param name="local_predictor_entries" value="4"/>
				<param name="global_predictor_entries" value="4096"/>
				<param name="global_predictor_bits" value="2"/>
				<param name="chooser_predictor_entries" value="4096"/>
				<param name="chooser_predictor_bits" value="2"/>
				<!-- These parameters can be combined like below in next version
				<param name="load_predictor" value="10,3,1024"/>
				<param name="global_predictor" value="4096,2"/>
				<param name="predictor_chooser" value="4096,2"/>
				-->
			</component>
			<component id="system.core0.itlb" name="itlb">
				<param name="number_entries" value="REPLACE{config.system.cpu.itb.size}"/>
				<stat name="total_accesses" value="REPLACE{stats.system.cpu.itb_walker_cache.tags.data_accesses}"/>
				<stat name="total_misses" value="REPLACE{stats.system.cpu.itb_walker_cache.tags.data_accesses}"/>
				<stat name="conflicts" value="0"/>	
				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
				which is actually a replacement -->
			</component>
			<component id="system.core0.icache" name="icache">
				<!-- there is no write requests to itlb although writes happen to it after miss, 
				which is actually a replacement -->
				<param name="icache_config" value="REPLACE{config.system.cpu.dcache.size,config.system.cpu.dcache.tags.block_size,config.system.cpu.dcache.assoc,1,1,config.system.cpu.dcache.hit_latency,config.system.cpu.dcache.tags.block_size,0}"/>
				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
				<!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
				<param name="buffer_sizes" value="REPLACE{config.system.cpu.dcache.mshrs,config.system.cpu.dcache.mshrs,config.system.cpu.dcache.mshrs,config.system.cpu.dcache.mshrs}"/>
				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
				<stat name="read_accesses" value="REPLACE{stats.system.cpu.icache.ReadReq_accesses::total}"/>
				<stat name="read_misses" value="REPLACE{stats.system.cpu.icache.ReadReq_misses::total}"/>
				<stat name="conflicts" value="REPLACE{stats.system.cpu.icache.tags.replacements}"/>				
			</component>
			<component id="system.core0.dtlb" name="dtlb">
				<param name="number_entries" value="REPLACE{config.system.cpu.dtb.size}"/><!--dual threads-->
				<stat name="total_accesses" value="REPLACE{stats.system.cpu.dtb_walker_cache.tags.data_accesses}"/>
				<stat name="total_misses" value="REPLACE{stats.system.cpu.dtb_walker_cache.tags.data_accesses}"/>
				<stat name="conflicts" value="0"/>	
			</component>
			<component id="system.core0.dcache" name="dcache">
			        <!-- all the buffer related are optional -->
				<param name="dcache_config" value="REPLACE{config.system.cpu.dcache.size,config.system.cpu.dcache.tags.block_size,config.system.cpu.dcache.assoc,1,1,config.system.cpu.dcache.hit_latency,config.system.cpu.dcache.tags.block_size,0}"/>
				<param name="buffer_sizes" value="REPLACE{config.system.cpu.dcache.mshrs,config.system.cpu.dcache.mshrs,config.system.cpu.dcache.mshrs,config.system.cpu.dcache.mshrs}"/>
				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
				<stat name="read_accesses" value="REPLACE{stats.system.cpu.dcache.ReadReq_accesses::total}"/>
				<stat name="write_accesses" value="REPLACE{stats.system.cpu.dcache.WriteReq_accesses::total}"/>
				<stat name="read_misses" value="REPLACE{stats.system.cpu.dcache.ReadReq_misses::total}"/>
				<stat name="write_misses" value="REPLACE{stats.system.cpu.dcache.WriteReq_misses::total}"/>
				<stat name="conflicts" value="REPLACE{stats.system.cpu.dcache.tags.replacements}"/>	
			</component>
			<param name="number_of_BTB" value="2"/>
			<component id="system.core0.BTB" name="BTB">
			        <!-- all the buffer related are optional -->
				<param name="BTB_config" value="4096,4,2, 2, 1,1"/> 
				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
				<stat name="read_accesses" value="REPLACE{stats.system.cpu.branchPred.BTBLookups}"/> <!--See IFU code for guideline -->
				<stat name="write_accesses" value="REPLACE{stats.system.cpu.commit.branches}"/>
			</component>
	</component>
		<component id="system.L1Directory0" name="L1Directory0">
				<param name="Directory_type" value="0"/>
			    <!--0 cam based shadowed tag. 1 directory cache -->	
				<param name="Dir_config" value="2048,1,0,1, 4, 4, 8"/>
				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
				<!-- all the buffer related are optional -->
			    <param name="clockrate" value="2000"/>
				<param name="ports" value="1,1,1"/>
				<!-- number of r, w, and rw search ports -->
				<param name="device_type" value="2"/>
				<!-- although there are multiple access types, 
				Performance simulator needs to cast them into reads or writes
				e.g. the invalidates can be considered as writes -->
				<stat name="read_accesses" value="800000"/>
				<stat name="write_accesses" value="27276"/>
				<stat name="read_misses" value="1632"/>
				<stat name="write_misses" value="183"/>
				<stat name="conflicts" value="20"/>	
			    <stat name="duty_cycle" value="0.1"/>
		</component>
		<component id="system.L2Directory0" name="L2Directory0">
				<param name="Directory_type" value="1"/>
			    <!--0 cam based shadowed tag. 1 directory cache -->	
				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
				<!-- all the buffer related are optional -->
			    <param name="clockrate" value="3400"/>
				<param name="ports" value="1,1,1"/>
				<!-- number of r, w, and rw search ports -->
				<param name="device_type" value="0"/>
				<!-- although there are multiple access types, 
				Performance simulator needs to cast them into reads or writes
				e.g. the invalidates can be considered as writes -->
				<stat name="read_accesses" value="58824"/>
				<stat name="write_accesses" value="27276"/>
				<stat name="read_misses" value="1632"/>
				<stat name="write_misses" value="183"/>
				<stat name="conflicts" value="100"/>
			    <stat name="duty_cycle" value="0.1"/>	
		</component>
		<component id="system.L20" name="L20">
			<!-- all the buffer related are optional -->
				<param name="L2_config" value="REPLACE{config.system.l2.size,config.system.l2.tags.block_size,config.system.l2.assoc,1,1,config.system.l2.hit_latency,config.system.l2.tags.block_size,1}"/> 
				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
				<param name="buffer_sizes" value="REPLACE{config.system.l2.mshrs,config.system.l2.mshrs,config.system.l2.mshrs,config.system.l2.mshrs}"/>
				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
				<param name="clockrate" value="REPLACE{config.system.clk_domain.clock"/>
				<param name="ports" value="1,1,1"/>
				<!-- number of r, w, and rw ports -->
				<param name="device_type" value="0"/>
				<stat name="read_accesses" value="REPLACE{stats.system.l2.ReadReq_accesses::total}"/>
				<stat name="write_accesses" value="REPLACE{stats.system.l2.ReadExReq_accesses::total}"/>
				<stat name="read_misses" value="REPLACE{stats.system.l2.ReadReq_misses::total}"/>
				<stat name="write_misses" value="REPLACE{stats.system.l2.ReadExReq_misses::total}"/>
				<stat name="conflicts" value="REPLACE{stats.system.l2.tags.replacements}"/>	
			    <stat name="duty_cycle" value="1.0"/>	
		</component>
		
<!--**********************************************************************-->
<component id="system.L30" name="L30">
				<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
				<param name="clockrate" value="800"/>
				<param name="ports" value="1,1,1"/>
				<!-- number of r, w, and rw ports -->
				<param name="device_type" value="0"/>
				<param name="buffer_sizes" value="16, 16, 16, 16"/>
				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
				<stat name="read_accesses" value="11824"/>
				<stat name="write_accesses" value="11276"/>
				<stat name="read_misses" value="1632"/>
				<stat name="write_misses" value="183"/>
				<stat name="conflicts" value="0"/>	
				<stat name="duty_cycle" value="1.0"/>	
		</component>
<!--**********************************************************************-->
		<component id="system.NoC0" name="noc0">
			<param name="clockrate" value="2000"/>
			<param name="type" value="0"/>
			<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
				at each time only one node can send req -->
			<param name="horizontal_nodes" value="1"/>
			<param name="vertical_nodes" value="1"/>
			<param name="has_global_link" value="0"/>
			<!-- 1 has global link, 0 does not have global link -->
			<param name="link_throughput" value="1"/><!--w.r.t clock -->
			<param name="link_latency" value="1"/><!--w.r.t clock -->
			<!-- throughput >= latency -->
			<!-- Router architecture -->
			<param name="input_ports" value="1"/>
			<param name="output_ports" value="1"/>
			<!-- For bus the I/O ports should be 1 -->
			<param name="flit_bits" value="64"/>
			<param name="chip_coverage" value="1"/>
			<!-- When multiple NOC present, one NOC will cover part of the whole chip. 
				chip_coverage <=1 -->
			<param name="link_routing_over_percentage" value="0.5"/>
			<!-- Links can route over other components or occupy whole area.
				by default, 50% of the NoC global links routes over other 
				components -->
			<stat name="total_accesses" value="REPLACE{stats.system.l2.demand_accesses::total * 2}"/>
			<!-- This is the number of total accesses within the whole network not for each router -->
			<stat name="duty_cycle" value="0.2"/>
		</component>		
<!--**********************************************************************-->
		<component id="system.mc" name="mc">
			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
			<!-- current version of McPAT uses published values for base parameters of memory controller
			improvements on MC will be added in later versions. -->
			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
			<param name="mc_clock" value="400"/><!--MHz-->
			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
			<param name="block_size" value="64"/><!--(B) the block size of last level cache, which is the unit for one memory burst transfer -->
			<param name="number_mcs" value="0"/>
			<!-- current McPAT only supports homogeneous memory controllers -->
			<param name="memory_channels_per_mc" value="1"/>
			<param name="number_ranks" value="0"/>
			<!-- # of ranks of each channel-->
			<param name="req_window_size_per_channel" value="32"/>
			<param name="IO_buffer_size_per_channel" value="32"/>
			<param name="databus_width" value="128"/>
			<param name="addressbus_width" value="51"/>
			<!-- McPAT will add the control bus width to the address bus width automatically -->
			<stat name="memory_accesses" value="REPLACE{stats.system.l2.overall_misses::total}"/>
			<stat name="memory_reads" value="REPLACE{stats.system.l2.ReadReq_misses::total}"/>
			<stat name="memory_writes" value="REPLACE{stats.system.l2.ReadExReq_misses::total}"/>
			<param name="withPHY" value="1"/>
			<!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate 
			the average power per MC or per channel. This is sufficient for most application. 
			Further trackdown can be easily added in later versions. -->  			
		</component>
<!--**********************************************************************-->
		<component id="system.niu" name="niu">
			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
			<param name="clockrate" value="350"/>
			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achieve-able bandwidth  -->
			<!-- McPAT does not track individual nic, instead, it takes the total accesses and calculate 
			the average power per nic or per channel. This is sufficient for most application. -->  			
		</component>
<!--**********************************************************************-->
		<component id="system.pcie" name="pcie">
			<!-- On chip PCIe controller, including Phy-->
			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
			<param name="withPHY" value="1"/>
			<param name="clockrate" value="350"/>
			<param name="number_units" value="0"/>
			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achieve-able bandwidth  -->
			<!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculate 
			the average power per pcie controller or per channel. This is sufficient for most application. -->  			
		</component>
<!--**********************************************************************-->
		<component id="system.flashc" name="flashc">
		    <param name="number_flashcs" value="0"/>
			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
            <param name="withPHY" value="1"/>
			<param name="peak_transfer_rate" value="200"/><!--Per controller sustain-able peak rate MB/S -->
			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achieve-able bandwidth  -->
			<!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculate 
			the average power per fc or per channel. This is sufficient for most application -->  			
		</component>
<!--**********************************************************************-->

		</component>
</component>