Hi at all!
I'm working with TI DM3730 (CortexA8 inside) and an external mobile DDR-SDRAM.
The startup initialize MMU, L1 Cache and L2 Cache and Flow Prediction.
Tests with about 256MiB of data show some data loss when L2 Cache is enabled.
If L2 is disabled and only L1 I and D Cache is enabled, the data tests work properly.
Here is my MMU-Setting:
Device-Registers: AP = 3, mode = 2, TEX = 0, C = 0, B = 0
internal RAM: AP = 3, TEX = 0, C = 1, B = 0
external RAM: AP = 3, TEX = 0, C = 1, B = 0
all areas are mapped to domain 0 and all domains are set to client mode.
If the MMU-Settings for external RAM are set to TEX = 0, C = 0 and B = 0 (disable caching for external RAM), there is also no data loss and all work fine.
Anyone an idea?
Here is the complete code of initialization:
/**************************************************** * disable 'Instruction Cache' , 'Flow Prediction' * disable 'Data Cache' , 'MMU' * enable 'Alignment check' ****************************************************/ mrc p15, 0, r0, c1, c0, 1 // read auxiliary control register bic r0, r0, #(0x1 << 1) // disable L2-Cache mcr p15, 0, r0, c1, c0, 1 // write back mrc p15, 0, r0, c1, c0, 0 // read control register bic r0, r0, #(0x1 << 0) // disable MMU orr r0, r0, #(0x1 << 1) // enable Strict alignment fault checking bic r0, r0, #(0x1 << 2) // disable D-Cache bic r0, r0, #(0x1 << 11) // disable Flow prediction bic r0, r0, #(0x1 << 12) // disable I-Cache bic r0, r0, #(0x1 << 13) // Use Normal exception vector mcr p15, 0, r0, c1, c0, 0 // write back /**************************************************** * set 'Vectore Base Address' ****************************************************/ ldr r0, =dm3730_initVector mcr p15, 0, r0, c12, c0, 0 /**************************************************** * Invalidate 'Translation Table Base' (TLB) * Invalidate 'Instruction Cache' und * lösche 'branch target cache' ****************************************************/ mov r0, #0 mcr p15, 0, r0, c8, c7, 0 // Invalidate Inst-TLB und Data-TLB mcr p15, 0, r0, c7, c5, 0 // Invalidate all instruction caches and flushes branch target cache /**************************************************** * Cache Invalidation code -> * github.com/.../startup.s ****************************************************/ mrc p15, 1, r0, c0, c0, 1 // Read CLIDR ands r3, r0, #0x07000000 // Extract coherency level mov r3, r3, lsr #23 // Total cache levels << 1 beq processor_FirstBootEntry_cacheExit // If 0, no need to clean mov r8, #0 // R8 holds current cache level << 1 processor_FirstBootEntry_cacheLoop1: add r2, r8, r8, lsr #1 // R2 holds cache "Set" position mov r1, r0, lsr r2 // Bottom 3 bits are the Cache-type for this level and r1, r1, #7 // Isolate those lower 3 bits cmp r1, #2 blt processor_FirstBootEntry_cacheSkip // No cache or only instruction cache at this level mcr p15, 2, r8, c0, c0, 0 // Write the Cache Size selection register ISB // ISB to sync the change to the CacheSizeID reg mrc p15, 1, r1, c0, c0, 0 // Reads current Cache Size ID register and r2, r1, #7 // Extract the line length field add r2, r2, #4 // Add 4 for the line length offset (log2 16 bytes) movw r4, #0x3ff ands r4, r4, r1, LSR #3 // R4 is the max number on the way size (right aligned) clz r5, r4 // R5 is the bit position of the way size increment movw r6, #0x7FFF ands r6, r6, r1, LSR #13 // R6 is the max number of the index size (right aligned) processor_FirstBootEntry_cacheLoop2: mov r7, r4 // R7 working copy of the max way size (right aligned) processor_FirstBootEntry_cacheLoop3: orr r1, r8, r7, lsl r5 // Factor in the Way number and cache number into R1 orr r1, r1, r6, lsl r2 // Factor in the Set number mcr p15, 0, r1, c7, c6, 2 // Invalidate by Set/Way subs r7, r7, #1 // Decrement the Way number bge processor_FirstBootEntry_cacheLoop3 subs r6, r6, #1 // Decrement the Set number bge processor_FirstBootEntry_cacheLoop2 processor_FirstBootEntry_cacheSkip: add r8, r8, #2 // increment the cache number cmp r3, r8 bgt processor_FirstBootEntry_cacheLoop1 processor_FirstBootEntry_cacheExit: DSB /**************************************************** * MMU Settings ****************************************************/ // Hier Code kopieren vom Cache-Handling /* invalidate TLB */ mov r0, #0 mcr p15, 0, r0, c8, c7, 0 // set TLB for instruction and data invalid /* TLB Addr setzen */ mov r0, #0 mcr p15, 0, r0, c2, c0, 2 // set Table-Base-Control register to 0 ldr r0, =mmuL1PageTable // load addr of MMU-L1-PageTable mcr p15, 0, r0, c2, c0, 0 // set addr of MMU-L1-PageTable /**************************************************** * Setup domain control register - * Enable all domains to client mode ****************************************************/ movw r0, #0x5555 // setl all domains to client mode movt r0, #0x5555 mcr p15, 0, r0, c3, c0, 0 // write domain control register /**************************************************** * Write L2 Cache Auxiliary Control Register ****************************************************/ mov r0, #0 mcr p15, 1, r0, c9, c0, 2 #ifdef __ARM_NEON__ /**************************************************** * Enable NEON/VFP ****************************************************/ mrc p15, 0, r0, c1, c0, 2 // Read CP Access register orr r0, r0, #(0xF << 20) // Enable full access to NEON/VFP (Coprocessors 10 and 11) mcr p15, 0, r0, c1, c0, 2 // Write CP Access register mov r0, #0x40000000 // Switch on the VFP and NEON hardware vmsr FPEXC, r0 // Write FPEXC register, EN bit set #endif /**************************************************** * Enable MMU ****************************************************/ mrc p15, 0, r0, c1, c0, 0 // read Control Register orr r0, r0, #(0x1 << 0) // enable MMU mcr p15, 0, r0, c1, c0, 0 // write back /**************************************************** * Enable L2-Caches ****************************************************/ mrc p15, 0, r0, c1, c0, 1 // Lese Auxiliary Control Register orr r0, r0, #(0x1 << 1) // enable L2-Cache mcr p15, 0, r0, c1, c0, 1 // write back /**************************************************** * Enable Caches ****************************************************/ mrc p15, 0, r0, c1, c0, 0 // read Control Register orr r0, r0, #(0x1 << 12) // enable I-Cache orr r0, r0, #(0x1 << 2) // enable D-Cache orr r0, r0, #(0x1 << 11) // enable Flow prediction mcr p15, 0, r0, c1, c0, 0 // write back
I am a bit confused. The addresses should all be of words but in both your examples the addresses 0x8018B181 and 0x80C5FEB1 they are odd. The code does not include a diagnostic to output the address and data.
I'm assuming you have just not multiplied the index by four so the first address is in turn 1 which is the second megabyte with index 0x0018B181 at address 0x8062C604, inverted is 0 and that means the value should be the same as the index 0x0018B181.
If so I certainly can't see any pattern in it and it doesn't look like any system instructions that would cause trouble are used between turns. If cache lines were going wrong I would have expected the first word of the cache line to be wrong but it is at some odd word past the start. The values don't look like anything that would ever be generated by the program - but then again I'd be a bit surprised to see such values in ram which had never been initialisied since power up. That's very strange.
Oh sorry, that was my fail. you're right, I forgot to multiply the value for address by 4.
Here is a corrected example: at address 0x814F6540 value is 0xA442962F and should be 0x0053D950. Below is the updated and corrected test-code including debug-output.
So I can agree with your last point. I also can't see a pattern or something else. Also If the test is performed Even if the test is performed several times after each, the wrong data are random as described above.
Here the updated test-code:
#define myMiB(value) (value*1024*1024) const uint_32 testByte = myMiB(256); // amount of test data in MiB const uint_32 incrementSrc = myMiB(1); // increment steps in MiB volatile uint_32* globalSrcAddr = (volatile uint_32*)0x80000000; // start-address for test uint_8 inverted = 0; // working variable for altering test mode uint_32 copyData(void) { /** Variablen **/ uint_32 i; uint_32 testPattern; /** Start **/ inverted = !inverted; for(i = 0; i < (testByte >> 2); i++) { if(inverted == 0) { testPattern = i; } else { testPattern = ~i; } globalSrcAddr[i] = testPattern; } return 0; } uint_32 checkData(void) { /** Variablen **/ uint_32 returnValue = 0; uint_32 i; uint_32 testPattern; /** Start **/ for(i = 0; i < (testByte >> 2); i++) { if(inverted == 0) { testPattern = i; } else { testPattern = ~i; } if(globalSrcAddr[i] != testPattern) { returnValue = i+1; cmdPut("\nERROR: i: %d (0x%08X) [addr: 0x%08X | ist: 0x%08X | soll: 0x%08X]\n", i,i , I((globalSrcAddr+i)), globalSrcAddr[i], testPattern); break; } } globalSrcAddr += (incrementSrc/4); return returnValue; }
There is perhaps a pattern in the address though there's only three there. The failures you've got have been in the first or second word after a 64 byte alignment, i.e. the address ends in 0b000x00 and the cache lines are 64 bytes long. If so I could see something happening for the first word - but why should the first word be sometimes okay but not the second, now that's strange. Anyway it does sound to me that it is the start of the transfer from the L2 cache to memory that is sometimes going wrong.
I don't suppose it will tell much - but it might be interesting to do a test where one just stores zeroes to memory and checks the result, and if that goes through stores all -1 to memory. If they do go wrong perhaps it would also make the random data that is read more meaningful.
Ok, I'm not sure if I really understand whats your idea with the pattern. But I can also see errors at 0x82247310 which is 0b00010000 (not match pattern 0b000x00) or 0x826D1090 which is 0b10010000.
I also performed the test you suggest. There are errors in both variants (0 and -1). For Example at address 0x82247310 value is 0xE40B761F and should be 0 or at address 0x803FC0C0 value is 0x2B883128 and should be -1 (0xFFFFFFFF).