This discussion has been locked.

You can no longer post new replies to this discussion. If you have a question you can start a new discussion

Partial register dependency neon

I'm having trouble finding any information on partial NEON register dependencies.

Take for example the following code:

ld2 {v0.16b, v1.16b}[0], [x0]
ld2 {v0.16b, v1.16b}[1], [x1]
ld2 {v0.16b, v1.16b}[2], [x2]
...

Does the second load have to wait for the previous one to complete or may it continue right away?

I'm working with image data that needs to be palettised using a table of 256 16-bit entries, and I want to process it further with NEON. Unfortunately, due to the table size, tbl instructions are not an option, since the table would take up all 32 registers. Would it be faster to do the lookup with scalar ARM instructions first, then combine the results and transfer them in four 64-bit registers?

If it helps I'm targeting Cortex-A57.

Parents

0 doofenstein over 4 years ago

I did some more elaborate testing so I could write a proper answer for anyone else having this question.

So here's the control test:

u64 tiledata[100];
    
// warm up cache
asm volatile ("" : : "r" (tiledata[0]) : );

u64 pmcr_el0;
asm volatile ("mrs %0, pmcr_el0" : "=r" (pmcr_el0));
pmcr_el0 &= 0x20;
pmcr_el0 |= 0x5;
asm volatile ("msr pmcr_el0, %0" : : "r" (pmcr_el0));
u64 pmcntenset_el0 = 0x80000000;
asm volatile ("msr pmcntenset_el0, %0" : : "r" (pmcntenset_el0));
u64 pmccfiltr_el0 = 0x10000000;
asm volatile ("msr pmccfiltr_el0, %0" : : "r" (pmcntenset_el0));

u64 i = 0;
asm volatile
(
"loopStart%=:\n"
    "ld1 {v0.s}[0], [%[ptr0]]\n"
    "ld1 {v1.s}[1], [%[ptr1]]\n"
    "ld1 {v2.s}[2], [%[ptr2]]\n"
    "ld1 {v3.s}[3], [%[ptr3]]\n"

    "ld1 {v4.s}[0], [%[ptr0]]\n"
    "ld1 {v5.s}[1], [%[ptr1]]\n"
    "ld1 {v6.s}[2], [%[ptr2]]\n"
    "ld1 {v7.s}[3], [%[ptr3]]\n"

    "add %[i], %[i], #1\n"
    "cmp %[i], #1000\n"
    "blt loopStart%=\n"
    :
        [i] "+r" (i)
    :
        [ptr0] "r" (&tiledata[0]), [ptr1] "r" (&tiledata[1]), [ptr2] "r" (&tiledata[2]), [ptr3] "r" (&tiledata[3])
    :
        "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
);

u64 cycles = 42;
asm volatile ("mrs %0, pmccntr_el0" : "=r" (cycles));
printf("cycles %ld\n", cycles);

Here's the experiment:

u64 tiledata[100];
    
// warm up cache
asm volatile ("" : : "r" (tiledata[0]) : );

u64 pmcr_el0;
asm volatile ("mrs %0, pmcr_el0" : "=r" (pmcr_el0));
pmcr_el0 &= 0x20;
pmcr_el0 |= 0x5;
asm volatile ("msr pmcr_el0, %0" : : "r" (pmcr_el0));
u64 pmcntenset_el0 = 0x80000000;
asm volatile ("msr pmcntenset_el0, %0" : : "r" (pmcntenset_el0));
u64 pmccfiltr_el0 = 0x10000000;
asm volatile ("msr pmccfiltr_el0, %0" : : "r" (pmcntenset_el0));

u64 i = 0;
asm volatile
(
"loopStart%=:\n"
    "ld1 {v0.s}[0], [%[ptr0]]\n"
    "ld1 {v0.s}[1], [%[ptr1]]\n"
    "ld1 {v0.s}[2], [%[ptr2]]\n"
    "ld1 {v0.s}[3], [%[ptr3]]\n"

    "ld1 {v0.s}[0], [%[ptr0]]\n"
    "ld1 {v0.s}[1], [%[ptr1]]\n"
    "ld1 {v0.s}[2], [%[ptr2]]\n"
    "ld1 {v0.s}[3], [%[ptr3]]\n"

    "add %[i], %[i], #1\n"
    "cmp %[i], #1000\n"
    "blt loopStart%=\n"
    :
        [i] "+r" (i)
    :
        [ptr0] "r" (&tiledata[0]), [ptr1] "r" (&tiledata[1]), [ptr2] "r" (&tiledata[2]), [ptr3] "r" (&tiledata[3])
    :
        "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
);

u64 cycles = 42;
asm volatile ("mrs %0, pmccntr_el0" : "=r" (cycles));
    printf("cycles %ld\n", cycles);

As mentioned in the original post I'm using a Cortex-A57 and I'm getting the same result for both tests of about 8000 cycles. So this seems to confirm the hypothesis that register dependencies are tracked at element level and not at register level.

If someone sees an error in my methodology, please tell me; otherwise I'll set this as the answer.

Reply

0 doofenstein over 4 years ago

I did some more elaborate testing so I could write a proper answer for anyone else having this question.

So here's the control test:

u64 tiledata[100];
    
// warm up cache
asm volatile ("" : : "r" (tiledata[0]) : );

u64 pmcr_el0;
asm volatile ("mrs %0, pmcr_el0" : "=r" (pmcr_el0));
pmcr_el0 &= 0x20;
pmcr_el0 |= 0x5;
asm volatile ("msr pmcr_el0, %0" : : "r" (pmcr_el0));
u64 pmcntenset_el0 = 0x80000000;
asm volatile ("msr pmcntenset_el0, %0" : : "r" (pmcntenset_el0));
u64 pmccfiltr_el0 = 0x10000000;
asm volatile ("msr pmccfiltr_el0, %0" : : "r" (pmcntenset_el0));

u64 i = 0;
asm volatile
(
"loopStart%=:\n"
    "ld1 {v0.s}[0], [%[ptr0]]\n"
    "ld1 {v1.s}[1], [%[ptr1]]\n"
    "ld1 {v2.s}[2], [%[ptr2]]\n"
    "ld1 {v3.s}[3], [%[ptr3]]\n"

    "ld1 {v4.s}[0], [%[ptr0]]\n"
    "ld1 {v5.s}[1], [%[ptr1]]\n"
    "ld1 {v6.s}[2], [%[ptr2]]\n"
    "ld1 {v7.s}[3], [%[ptr3]]\n"

    "add %[i], %[i], #1\n"
    "cmp %[i], #1000\n"
    "blt loopStart%=\n"
    :
        [i] "+r" (i)
    :
        [ptr0] "r" (&tiledata[0]), [ptr1] "r" (&tiledata[1]), [ptr2] "r" (&tiledata[2]), [ptr3] "r" (&tiledata[3])
    :
        "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
);

u64 cycles = 42;
asm volatile ("mrs %0, pmccntr_el0" : "=r" (cycles));
printf("cycles %ld\n", cycles);

Here's the experiment:

u64 tiledata[100];
    
// warm up cache
asm volatile ("" : : "r" (tiledata[0]) : );

u64 pmcr_el0;
asm volatile ("mrs %0, pmcr_el0" : "=r" (pmcr_el0));
pmcr_el0 &= 0x20;
pmcr_el0 |= 0x5;
asm volatile ("msr pmcr_el0, %0" : : "r" (pmcr_el0));
u64 pmcntenset_el0 = 0x80000000;
asm volatile ("msr pmcntenset_el0, %0" : : "r" (pmcntenset_el0));
u64 pmccfiltr_el0 = 0x10000000;
asm volatile ("msr pmccfiltr_el0, %0" : : "r" (pmcntenset_el0));

u64 i = 0;
asm volatile
(
"loopStart%=:\n"
    "ld1 {v0.s}[0], [%[ptr0]]\n"
    "ld1 {v0.s}[1], [%[ptr1]]\n"
    "ld1 {v0.s}[2], [%[ptr2]]\n"
    "ld1 {v0.s}[3], [%[ptr3]]\n"

    "ld1 {v0.s}[0], [%[ptr0]]\n"
    "ld1 {v0.s}[1], [%[ptr1]]\n"
    "ld1 {v0.s}[2], [%[ptr2]]\n"
    "ld1 {v0.s}[3], [%[ptr3]]\n"

    "add %[i], %[i], #1\n"
    "cmp %[i], #1000\n"
    "blt loopStart%=\n"
    :
        [i] "+r" (i)
    :
        [ptr0] "r" (&tiledata[0]), [ptr1] "r" (&tiledata[1]), [ptr2] "r" (&tiledata[2]), [ptr3] "r" (&tiledata[3])
    :
        "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
);

u64 cycles = 42;
asm volatile ("mrs %0, pmccntr_el0" : "=r" (cycles));
    printf("cycles %ld\n", cycles);

If someone sees an error in my methodology, please tell me; otherwise I'll set this as the answer.

Children

No data