This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

Data Cache Zero by Virtual Address (DC ZVA) instruction

Hi everyone,

I have been trying to test whether or not the DC ZVA instruction causes an L1 or L2 cache allocation on Cortex-A73.

The ARMv8-A architecture reference manual makes no statement about whether the DC ZVA instruction causes allocation to any particular level of the cache, and neither does the Cortex-A73 reference manual.

So I measured the latency of a cache access and of a memory access first, then I measured the latency using the DC ZVA instruction. Here is my code and the result:

/* Number of iterations executed by each timed measurement loop. */
#define ROUND 5000

/* Size in bytes of the anonymous mapping the test addresses are taken from (1 GiB). */
const size_t chunk_size = (size_t)1 << 30;

// call this function to start a timer; it returns the current CLOCK_MONOTONIC reading by value
struct timespec timer_start(){
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	return now;
}

// call this function to stop a timer started with timer_start();
// it returns the nanoseconds elapsed since start_time as a long
long timer_end(struct timespec start_time){
	struct timespec now;
	clock_gettime(CLOCK_MONOTONIC, &now);

	// Split the interval into whole seconds and the (possibly negative) nanosecond remainder.
	time_t sec_delta = now.tv_sec - start_time.tv_sec;
	long nsec_delta = now.tv_nsec - start_time.tv_nsec;

	long elapsed_ns = sec_delta * (long)1e9 + nsec_delta;
	return elapsed_ns;
}

/*
 * Print the affinity mask obtained from sched_getaffinity(0, ...) to stdout:
 * the text "sched_getaffinity = " followed by the CPU_ISSET result for every
 * CPU index below the online-processor count. No trailing newline is written.
 * If the mask cannot be read, the error is reported with perror and the
 * program stops on the assert.
 */
void print_affinity() {
	cpu_set_t allowed;

	if (sched_getaffinity(0, sizeof(cpu_set_t), &allowed) == -1) {
		perror("sched_getaffinity");
		assert(false);
	}

	long online = sysconf(_SC_NPROCESSORS_ONLN);

	printf("sched_getaffinity = ");
	long cpu = 0;
	while (cpu < online) {
		printf("%d ", CPU_ISSET(cpu, &allowed));
		cpu++;
	}
}

/*
 * Restrict the caller to CPU 0 via sched_setaffinity, printing the affinity
 * mask and the value of sched_getcpu() both before and after the change.
 * A failing sched_setaffinity is reported with perror and stops on the assert.
 */
void bind_to_cpu (){
	cpu_set_t only_cpu0;

	// State before pinning.
	print_affinity();
	printf("\n");
	printf("sched_getcpu = %d\n", sched_getcpu());

	// Build a mask that contains CPU 0 only and apply it.
	CPU_ZERO(&only_cpu0);
	CPU_SET(0, &only_cpu0);
	int rc = sched_setaffinity(0, sizeof(cpu_set_t), &only_cpu0);
	if (rc == -1) {
		perror("sched_setaffinity");
		assert(false);
	}

	// State after pinning.
	print_affinity();
	printf("\nsched_getcpu = %d\n", sched_getcpu());
}



/*
 * Micro-benchmark entry point.
 *
 * Pins execution to CPU 0, maps and fills a chunk_size-byte anonymous region,
 * picks two addresses at 4 KiB-multiple offsets inside it and then times
 * ROUND iterations of three access patterns on those two addresses:
 *   1. plain LDR from both addresses,
 *   2. LDR from both addresses followed by DC CIVAC on both,
 *   3. DC ZVA on both addresses (after one initial DC CIVAC of each).
 * Each printed figure is total elapsed ns / ROUND, i.e. the average cost of
 * one iteration, and one iteration touches BOTH addresses.
 *
 * Returns 0 on success and -1 if the region cannot be mapped or unmapped.
 */
int main(void){

	bind_to_cpu();
	void* memory_pool = mmap(NULL, chunk_size, PROT_READ | PROT_WRITE, MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (memory_pool == MAP_FAILED){
		fprintf(stderr, "Error: Could not map memory.\n");
		return -1;
	}

	//initialize chunk
	memset(memory_pool, 0xff, chunk_size);

	// Widen rand() to size_t before shifting: shifting the int result left by
	// 12 overflows a signed int (undefined behaviour) for large rand() values.
	// Pointer arithmetic goes through char* because void* arithmetic is a
	// GNU extension, not standard C.
	size_t offset1 = ((size_t)rand() << 12) % chunk_size;
	void* addr1 = (char*)memory_pool + offset1;
	size_t offset2 = ((size_t)rand() << 12) % chunk_size;
	void *addr2 = (char*)memory_pool + offset2;

//------------ CACHE ACCESS TIME USING LDR INSTRUCTION ---------------
	// "memory" clobbers keep the compiler from moving memory accesses across
	// the barriers and the measured instructions.
	asm volatile("dsb sy" ::: "memory");
	asm volatile("isb" ::: "memory");
	// 64-bit destination: with a plain %0 operand GCC emits an X register,
	// so the load is 64 bits wide and the C variable must match.
	uint64_t value = 0;
	struct timespec start = timer_start();
	for (int j = 0; j < ROUND; ++j) {
		asm volatile ("ldr %0, [%1]\n\t" : "=r" (value) : "r" (addr1) : "memory");
		asm volatile ("ldr %0, [%1]\n\t" : "=r" (value) : "r" (addr2) : "memory");
	}
	long time_taken = timer_end(start);
	printf("cache access time: %ld ns\n", time_taken/ROUND);

//------------ MEMORY ACCESS TIME USING LDR + DC CIVAC INSTRUCTION --------------
	asm volatile("dsb sy" ::: "memory");
	asm volatile("isb" ::: "memory");
	uint64_t temp = 0;
	start = timer_start();
	for (int j = 0; j < ROUND; ++j) {
		// temp is WRITTEN by the loads, so it must be an output operand.
		// It is early-clobber ("=&r") because it is written before the
		// second address input has been consumed.
		asm volatile(
		"ldr %0, [%1]\n\t"
		"ldr %0, [%2]\n\t"
		"dc civac, %1\n\t"
		"dc civac, %2\n\t"
		: "=&r" (temp)
		: "r" (addr1), "r" (addr2)
		: "memory"
		);
	}
	time_taken = timer_end(start);
	printf("memory access time: %ld ns\n", time_taken/ROUND);

// ------------ ZVA INSTRUCTION --------------------------------
	asm volatile("dsb sy" ::: "memory");
	asm volatile("isb" ::: "memory");
	asm volatile(
	"dc civac, %0\n\t"
	"dc civac, %1\n\t"
	::"r" (addr1), "r" (addr2)
	: "memory"
	);
	// Cache maintenance is only guaranteed complete after a DSB; without it
	// the clean+invalidate above may still be in flight when timing starts.
	asm volatile("dsb sy" ::: "memory");
	start = timer_start();
	for (int j = 0; j < ROUND; ++j) {
		// use dc zva to access target rows
		// DC ZVA writes zeros to memory, hence the "memory" clobber.
		// NOTE(review): the lines are flushed only once before the loop, so
		// iterations after the first may hit lines the first DC ZVA allocated
		// (if it allocates at all) -- confirm this is the intended experiment.
		asm volatile(
		"dc zva, %0\n\t"
		"dc zva, %1\n\t"
		::"r" (addr1), "r" (addr2)
		: "memory"
		);
	}
	time_taken = timer_end(start);
	printf("DC ZVA: %ld ns\n", time_taken/ROUND);

	// The loaded values are irrelevant; only the accesses matter.
	(void)value;
	(void)temp;

	// Release the mapping instead of relying on process exit.
	if (munmap(memory_pool, chunk_size) == -1) {
		perror("munmap");
		return -1;
	}

	return 0;
}

I got the following output:

cache access time: 4 ns
memory access time: 210 ns
DC ZVA: 21 ns

So, what do you think about it?

I am not sure whether the latency of the DC ZVA instruction is greater than that of a cache access because the DC ZVA instruction simply takes more time than an LDR instruction, or because it is performing a memory access.

Thank you.

  • DC ZVA generally behaves like storing zeros to memory. It could cause cache line allocation.

    But if you were performing memset to a large block of memory, it would trigger write streaming mode.

    There are some situations where allocating on writes is not required. For example, when executing the C standard library memset() function to clear a large block of memory to a known value. Writes of large blocks of data can pollute the cache with unnecessary data. It can also waste power and performance if a linefill must be performed only to discard the linefill data because the entire line was subsequently written by the memset.

    In write streaming mode, a store miss does not cause cache line allocation.