This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

Cortex A8 Instruction Cycle Timing

Note: This was originally posted on 17th March 2011 at http://forums.arm.com

Hi, sorry for my bad English.

I need to work out the latency of two instructions, and all I have is the ARM Cortex-A8 documentation (chapter 16)!
But I have no idea how to do this using that documentation.
Parents
  • Note: This was originally posted on 9th August 2011 at http://forums.arm.com


    What is your test procedure?
    You have made a loop executed 1000 times (for example) and you have found 46.000 cycles for the first example
    and (11 + 9) * 1000 = 20.000 cycles for the second?


    Hi Etienne,
    That's true. I have a loop executed 1000 times and I am getting 46,000 cycles for the first example and 21 for the second example.
    I have given the whole function for your reference. r0 has the loop count and r1 the input buffer pointer.

    First example:

    [indent][indent].text;
    .align 4;
    .global vmlaq_vld_f32_interleaved;
    .type vmlaq_vld_f32_interleaved,%function;

    @ vmlaq_vld_f32_interleaved: timing loop in which every NEON load is
    @ immediately followed by one multiply-accumulate (load/MAC interleaved).
    @ In:  r0 = loop count, r1 = input buffer pointer.
    @ The [r1:128] operand carries a 128-bit alignment qualifier and has no
    @ writeback, so r1 is never advanced: every vld1 reads the same 16 bytes.
    @ The loop test sits at the bottom, so the body runs at least once.
    @ NOTE(review): d8, d10 and d11 are written here and never saved; the
    @ AAPCS VFP rules treat d8-d15 as callee-saved - confirm callers tolerate it.
    @ NOTE(review): the name says "vmlaq" but the MACs below use D (64-bit)
    @ registers, not Q registers - confirm that is intended.
    vmlaq_vld_f32_interleaved:

    core_loop_beg6:
    @ Nine load/MAC pairs. Each vmla.f32 does dN += d15 * d14 on its own
    @ accumulator (d0..d8); none of the freshly loaded registers feeds a vmla.
    vld1.32 {d16,d17},[r1:128];
    vmla.f32 d0,d15,d14;
    vld1.32 {d18,d19},[r1:128];
    vmla.f32 d1,d15,d14;
    vld1.32 {d20,d21},[r1:128];
    vmla.f32 d2,d15,d14;
    vld1.32 {d22,d23},[r1:128];
    vmla.f32 d3,d15,d14;
    vld1.32 {d24,d25},[r1:128];
    vmla.f32 d4,d15,d14;
    vld1.32 {d26,d27},[r1:128];
    vmla.f32 d5,d15,d14;
    vld1.32 {d28,d29},[r1:128];
    vmla.f32 d6,d15,d14;
    vld1.32 {d30,d31},[r1:128];
    vmla.f32 d7,d15,d14;
    vld1.32 {d10,d11},[r1:128];
    vmla.f32 d8,d15,d14;
    @ Decrement the counter and loop again while it is still > 0 (signed).
    subs r0,r0,#1;
    bgt core_loop_beg6;
    core_loop_end6:
    @ Return to the caller.
      BX   lr;
    [/indent][/indent]
    Second example.
    [indent][indent].text;
    .align 4;
    .global vld1_aligned;
    .type vld1_aligned,%function;

    @ vld1_aligned: timing loop with the same nine MACs and nine loads per
    @ iteration, but grouped: all vmla.f32 first, then all vld1.32.
    @ In:  r0 = loop count, r1 = input buffer pointer.
    @ The [r1:128] operand carries a 128-bit alignment qualifier and has no
    @ writeback, so r1 is never advanced: every vld1 reads the same 16 bytes.
    @ The loop test sits at the bottom, so the body runs at least once.
    @ NOTE(review): d8, d12 and d13 are written here and never saved; the
    @ AAPCS VFP rules treat d8-d15 as callee-saved - confirm callers tolerate it.
    @ NOTE(review): the name mentions only the load, yet the loop also runs
    @ nine MACs - confirm the name still matches the experiment.
    vld1_aligned:

    core_loop_beg:

    @ MAC group: dN += d15 * d14 for nine separate accumulators d0..d8.
    vmla.f32 d0,d15,d14;
    vmla.f32 d1,d15,d14;
    vmla.f32 d2,d15,d14;
    vmla.f32 d3,d15,d14;
    vmla.f32 d4,d15,d14;
    vmla.f32 d5,d15,d14;
    vmla.f32 d6,d15,d14;
    vmla.f32 d7,d15,d14;
    vmla.f32 d8,d15,d14;

    @ Load group: nine 128-bit loads into distinct register pairs; none of
    @ the destinations is read by the vmla group above.
    vld1.32 {d16,d17},[r1:128];
    vld1.32 {d18,d19},[r1:128];
    vld1.32 {d20,d21},[r1:128];
    vld1.32 {d22,d23},[r1:128];
    vld1.32 {d24,d25},[r1:128];
    vld1.32 {d26,d27},[r1:128];
    vld1.32 {d28,d29},[r1:128];
    vld1.32 {d30,d31},[r1:128];
    vld1.32 {d12,d13},[r1:128];

    @ Decrement the counter and loop again while it is still > 0 (signed).
    subs r0,r0,#1;
    bgt core_loop_beg;
    core_loop_end:
    @ Return to the caller.
      BX   lr;
    [/indent][/indent]
    Regards,
    Anil M S
Reply
  • Note: This was originally posted on 9th August 2011 at http://forums.arm.com


    What is your test procedure?
    You have made a loop executed 1000 times (for example) and you have found 46.000 cycles for the first example
    and (11 + 9) * 1000 = 20.000 cycles for the second?


    Hi Etienne,
    That's true. I have a loop executed 1000 times and I am getting 46,000 cycles for the first example and 21 for the second example.
    I have given the whole function for your reference. r0 has the loop count and r1 the input buffer pointer.

    First example:

    [indent][indent].text;
    .align 4;
    .global vmlaq_vld_f32_interleaved;
    .type vmlaq_vld_f32_interleaved,%function;

    @ vmlaq_vld_f32_interleaved: timing loop in which every NEON load is
    @ immediately followed by one multiply-accumulate (load/MAC interleaved).
    @ In:  r0 = loop count, r1 = input buffer pointer.
    @ The [r1:128] operand carries a 128-bit alignment qualifier and has no
    @ writeback, so r1 is never advanced: every vld1 reads the same 16 bytes.
    @ The loop test sits at the bottom, so the body runs at least once.
    @ NOTE(review): d8, d10 and d11 are written here and never saved; the
    @ AAPCS VFP rules treat d8-d15 as callee-saved - confirm callers tolerate it.
    @ NOTE(review): the name says "vmlaq" but the MACs below use D (64-bit)
    @ registers, not Q registers - confirm that is intended.
    vmlaq_vld_f32_interleaved:

    core_loop_beg6:
    @ Nine load/MAC pairs. Each vmla.f32 does dN += d15 * d14 on its own
    @ accumulator (d0..d8); none of the freshly loaded registers feeds a vmla.
    vld1.32 {d16,d17},[r1:128];
    vmla.f32 d0,d15,d14;
    vld1.32 {d18,d19},[r1:128];
    vmla.f32 d1,d15,d14;
    vld1.32 {d20,d21},[r1:128];
    vmla.f32 d2,d15,d14;
    vld1.32 {d22,d23},[r1:128];
    vmla.f32 d3,d15,d14;
    vld1.32 {d24,d25},[r1:128];
    vmla.f32 d4,d15,d14;
    vld1.32 {d26,d27},[r1:128];
    vmla.f32 d5,d15,d14;
    vld1.32 {d28,d29},[r1:128];
    vmla.f32 d6,d15,d14;
    vld1.32 {d30,d31},[r1:128];
    vmla.f32 d7,d15,d14;
    vld1.32 {d10,d11},[r1:128];
    vmla.f32 d8,d15,d14;
    @ Decrement the counter and loop again while it is still > 0 (signed).
    subs r0,r0,#1;
    bgt core_loop_beg6;
    core_loop_end6:
    @ Return to the caller.
      BX   lr;
    [/indent][/indent]
    Second example.
    [indent][indent].text;
    .align 4;
    .global vld1_aligned;
    .type vld1_aligned,%function;

    @ vld1_aligned: timing loop with the same nine MACs and nine loads per
    @ iteration, but grouped: all vmla.f32 first, then all vld1.32.
    @ In:  r0 = loop count, r1 = input buffer pointer.
    @ The [r1:128] operand carries a 128-bit alignment qualifier and has no
    @ writeback, so r1 is never advanced: every vld1 reads the same 16 bytes.
    @ The loop test sits at the bottom, so the body runs at least once.
    @ NOTE(review): d8, d12 and d13 are written here and never saved; the
    @ AAPCS VFP rules treat d8-d15 as callee-saved - confirm callers tolerate it.
    @ NOTE(review): the name mentions only the load, yet the loop also runs
    @ nine MACs - confirm the name still matches the experiment.
    vld1_aligned:

    core_loop_beg:

    @ MAC group: dN += d15 * d14 for nine separate accumulators d0..d8.
    vmla.f32 d0,d15,d14;
    vmla.f32 d1,d15,d14;
    vmla.f32 d2,d15,d14;
    vmla.f32 d3,d15,d14;
    vmla.f32 d4,d15,d14;
    vmla.f32 d5,d15,d14;
    vmla.f32 d6,d15,d14;
    vmla.f32 d7,d15,d14;
    vmla.f32 d8,d15,d14;

    @ Load group: nine 128-bit loads into distinct register pairs; none of
    @ the destinations is read by the vmla group above.
    vld1.32 {d16,d17},[r1:128];
    vld1.32 {d18,d19},[r1:128];
    vld1.32 {d20,d21},[r1:128];
    vld1.32 {d22,d23},[r1:128];
    vld1.32 {d24,d25},[r1:128];
    vld1.32 {d26,d27},[r1:128];
    vld1.32 {d28,d29},[r1:128];
    vld1.32 {d30,d31},[r1:128];
    vld1.32 {d12,d13},[r1:128];

    @ Decrement the counter and loop again while it is still > 0 (signed).
    subs r0,r0,#1;
    bgt core_loop_beg;
    core_loop_end:
    @ Return to the caller.
      BX   lr;
    [/indent][/indent]
    Regards,
    Anil M S
Children
No data