/* Henry Wong http://blog.stuffedcow.net/2015/08/pagewalk-coherence/ 2015-08-12 */ #include #include #include #include #include #include #include #include #include #include #define PAE 0 // Sorry, no runtime detection of 32-bit PAE. Too lazy. #define DELAY_REMAP 0 /* Mode 0: Delay the remapping store to give more time for the pagewalk to be speculatively executed before the store executes, which should encourage more speculative pagewalks. In practice, doesn't seem to affect results. Mode 1: Make the remap store dependent on the imul chain, so it can't execute early. This causes the load DTLB-miss to execute before the preceding store-address operation. Can distinguish processors with non-speculative memory systems because the load (and page walk) can't pass the earlier store that got delayed (that modifies the page table). Makes latency for case 2 increase to roughly match case 3. Caution: On Intel (Core2 through Haswell), if the page walk occurs too early, there is a penalty of some sort (DTLB miss count is doubled, and it takes longer.) This appears to be the same penalty when there is a true page table coherence violation. This might hint at how page walk coherence violation detection happens... Sandy Bridge: Penalty if DTLB miss occurs > 10 cycles before store-address of preceding store. Ivy Bridge: >9 cycles Haswell: >9 cycles */ #define REPEAT2(X) X X #define REPEAT3(X) X X X #define REPEAT4(X) X X X X #define REPEAT8(X) REPEAT4(X) REPEAT4(X) #define REPEAT12(X) REPEAT4(X) REPEAT4(X) REPEAT4(X) #define REPEAT14(X) REPEAT4(X) REPEAT4(X) REPEAT4(X) REPEAT2(X) #define REPEAT16(X) REPEAT4(X) REPEAT4(X) REPEAT4(X) REPEAT4(X) #define REPEAT32(X) REPEAT16(X) REPEAT16(X) #if __x86_64__ typedef uint64_t pte_t; // 64-bit always uses 8-byte PTEs typedef uint64_t reg_t; // 64-bit integer registers typedef uint64_t paddr_t; // 52-bit physical address int pte_per_pde = 512; // 512 for PAE/64bit, 1024 for 32-bit. const int translation_levels = 4; const int translation_shifts[4] = { 12, 21, 30, 39 }; #elif __i386__ typedef uint32_t reg_t; // 32-bit integer registers #if PAE typedef uint64_t pte_t; // 32-bit PAE uses 8-byte PTEs typedef uint64_t paddr_t; // 52-bit physical address int pte_per_pde = 512; // 512 for PAE/64bit, 1024 for 32-bit. const int translation_levels = 3; const int translation_shifts[3] = { 12, 21, 30 }; #else typedef uint32_t pte_t; // 32-bit non-PAE uses 4-byte PTEs. typedef uint32_t paddr_t; // 32-bit physical address int pte_per_pde = 1024; // 512 for PAE/64bit, 1024 for 32-bit. const int translation_levels = 2; const int translation_shifts[2] = { 12, 22 }; #endif // PAE #else #error "Unknown architecture" #endif #define ITS (1024) struct s_opts { int preload; // 0 = flush TLB, 1 = page 1, 2 = page 2. unsigned int asize; // array size, in 4K pages int mode; // Which test to run. int chain_deps; // Adjust data dependencies for mode 1 bool dont_remap; // Change pointer to page table to avoid the remap. Applies to mode 0 and 1. } opts; inline uint64_t rdtsc() { uint32_t lo, hi; __asm__ volatile (".byte 0x0f, 0x31" : "=a" (lo), "=d" (hi)); return (uint64_t)(((uint64_t)hi)<<32LL) | (uint64_t) lo; } int fd_devmem = -1; void read_physical(paddr_t addr, void* buf, size_t size) { if (fd_devmem < 0) return; lseek64(fd_devmem, addr, SEEK_SET); read(fd_devmem, buf, size); } void write_physical(paddr_t addr, const void* buf, size_t size) { if (fd_devmem < 0) return; if (addr < 65536) printf ("Something seems wrong. Writing to address %p refused.\n", (void*)addr); lseek64(fd_devmem, addr, SEEK_SET); write(fd_devmem, buf, size); } void pgmod_flush_tlb() { int f = open("/proc/pgmod", O_WRONLY); write(f, "hi", 1); close(f); } // Translates virtual address to physical. // 4KB paging in 32-bit, PAE, and 64-bit modes are similar enough this can be done with // one routine and a bunch of architecture-specific typedefs. // If level > 0, stop early by "level" lookups and return the PTE. Useful for getting // a phyaddr pointer to a page table to edit it. // 0 : Do a complete translation // 1 : Stop at the final PTE. Return the final PTE // 2 : Return the PDE, one level up the tree pte_t translate(uintptr_t cr3, uintptr_t vaddr, int level=0) { // Canonical form only applies to 64-bit pointers with 48-bit virtual address space. if ((sizeof(vaddr) == 8) && (uint64_t)vaddr >> 48 != 0 && (uint64_t)vaddr >> 48 != 0xffff) { printf ("vaddr %016llx is not in canonical form\n", (uint64_t)vaddr); return 0; } pte_t pte = 0; paddr_t cur_ptr = cr3; for (int l = translation_levels-1; l >= 0; l--) { int cur_offset = (vaddr >> translation_shifts[l]) & (pte_per_pde-1); read_physical(cur_ptr + cur_offset*sizeof(pte), &pte, sizeof(pte)); if ((l != 0) && (pte & 0x80)) { printf ("Large page at level %d detected. Only 4 KB pages supported\n", l); exit(1); // Page tables not cleaned up before exit. Expect lots of log messages. return 0; } if (level == l+1) return pte; if (~pte & 1) { printf ("Page table level %d entry %016llx not present. Fail to translate %016llx\n", l, (uint64_t)pte, (uint64_t)vaddr); exit(1); return 0; } cur_ptr = pte & 0x000ffffffffff000ULL; } return cur_ptr | (vaddr&0xfff); // pte_t is always at least as big as paddr_t } // Changes a PTE. Returns the old PTE. Only for the lowest level of page tables. pte_t change_pte(uintptr_t cr3, uintptr_t vaddr, pte_t new_pte) { pte_t pde = translate(cr3, vaddr, 2) & 0x000ffffffffff000ULL; int cur_offset = (vaddr>>12) & (pte_per_pde-1); pte_t old_pte; read_physical(pde+cur_offset*sizeof(pte_t), &old_pte, sizeof(pte_t)); write_physical(pde+cur_offset*sizeof(pte_t), &new_pte, sizeof(pte_t)); //printf ("Updating PTE from %016llx to %016llx\n", (uint64_t)old_pte, (uint64_t)new_pte); return old_pte; } bool parse_opts(int argc, char *argv[]) { memset(&opts, 0, sizeof(opts)); opts.asize = 4096; opts.chain_deps = 4; // Default to imul dependent on previous imul, but the load is independent. bool fail = false; for (int i=1;i= argc) { printf ("--preload needs an argument\n"); fail = true; } else { sscanf(argv[++i], "%i", &opts.preload); } } else if (!strcmp(argv[i], "--asize")) { if (i+1 >= argc) { printf ("--asize needs an argument\n"); fail = true; } else { sscanf(argv[++i], "%u", &opts.asize); if (opts.asize > 16384 || opts.asize <= 0) { printf ("--asize must not exceed 16K pages because randindex[] uses 16-bit page numbers\n"); fail = true; } } } else if (!strcmp(argv[i], "--mode")) { if (i+1 >= argc) { printf ("--mode needs an argument\n"); fail = true; } else { sscanf(argv[++i], "%i", &opts.mode); if (opts.mode > 1 || opts.mode < 0) { printf ("--mode out of range\n"); fail = true; } } } else if (!strcmp(argv[i], "--dont_remap")) { opts.dont_remap = 1; } else if (!strcmp(argv[i], "--chain_deps")) { if (i+3 >= argc) { printf ("--chain_deps needs three arguments\n"); fail = true; } else { int t1, t2, t3; sscanf(argv[++i], "%i", &t1); sscanf(argv[++i], "%i", &t2); sscanf(argv[++i], "%i", &t3); opts.chain_deps = (t1 ? 4 : 0) | (t2 ? 2 : 0) | (t3 ? 1:0) ; } } else if (!strcmp(argv[i], "--chain_depsb")) { if (i+1 >= argc) { printf ("--chain_depsb needs one argument\n"); fail = true; } else { int t1; sscanf(argv[++i], "%i", &t1); opts.chain_deps = t1 & 7; } } else { printf ("unknown option '%s'\n", argv[i]); fail = true; } } if (fail) { printf ( "\n" "Usage: paging [options]\n" " Creates some arrays, changes page table mappings to map the page tables of a large\n" " array to be visible through the small array. Then changes the page table mappings\n" " of the large array and see which mapping is actually used by the processor\n" " Options:\n" " --asize Array size, in 4 KB pages. Default is 4096. Normally chosen\n" " to be big enough to overflow all TLBs and cause page walks\n" " Memory footprint is roughly 4 KB + asize * (8 byte + one PTE)\n" " --preload Before the start of the test, try to preload the TLB with mappings\n" " pointing to page 1 or 2. 0 means flush the TLB before tests.\n" " Used to test how long PTEs can stay in the TLB with smaller asize.\n" " --mode <0-1> Which test to run. Mode 0 tests for speculative page walks. Mode 1\n" " measures whether page walks can be overlapped with arithmetic\n" " --dont_remap Remove PTE remapping to test for any TLB coherence mechanisms.\n" " --chain_deps c b a\n" " --chain_depsb n\n" " For mode 1 (see source!), controls the data dependenies between the\n" " imul chain and the load. 1 means dependent. There are three dependencies:\n" " a: Controls whether the load address is dependent on the imul chain\n" " b: Controls whether the imul chain is dependent on the previous load\n" " c: Controls whether the imul chain is dependent on the previous imul chain\n" " n: A single integer where bits [2:0] are c, b, and a\n" " Default is 1 0 0 (or n=4)\n" " This allows detecting under which cases arithmetic operations can be\n" " overlapped with the page walk.\n" ); } return !fail; } int main(int argc, char *argv[]) { if (!parse_opts(argc, argv)) { printf ("Bad options.\n"); return 1; } const unsigned int ASIZE = opts.asize; // Size of page tables for bigbuf, in pages. int num_pde_to_map = ((ASIZE+2*pte_per_pde-2)/pte_per_pde); // Lets see if our module to retrieve cr3 exists... FILE *f = fopen("/proc/pgmod","rt"); if (!f) { printf("Can't open /proc/pgmod. Module not loaded?\n"); return -1; } fd_devmem = open("/dev/mem", O_RDWR | O_SYNC); if (fd_devmem == -1) { printf ("Can't open /dev/mem. Permissions?\n"); return -2; } uint64_t cr3, cr4; fscanf(f, "%llx ", &cr3); fscanf(f, "%llx ", &cr4); fclose(f); printf ("Compiled for %s\n", translation_levels == 4 ? "x86-64" : translation_levels == 3 ? "x86 PAE" : "x86 non-PAE"); printf ("CR3 appears to be %016llx\n", cr3); printf (" num_pde_to_map = %d\n", num_pde_to_map); printf (" array size = %d pages\n", opts.asize); printf (" translation levels = %d\n", translation_levels); printf ("CR4 appears to be %016llx\n", cr4); printf (" PAE = %s\n", (cr4 & 0x20) ? "Enabled" : "Disabled"); if ((pte_per_pde == 512) != !!(cr4 & 0x20)) { printf ("PAE state unexpected. Quitting.\n"); // PAE has 512 PTEs per page table, non-PAE has 1024. return -3; } pte_t *old_pagetable = (pte_t*) malloc(ASIZE*(sizeof(pte_t))); pte_t *old_pgmap_ptes = (pte_t*) malloc(num_pde_to_map*(sizeof(pte_t))); pte_t *scratch_memory = (pte_t*) malloc(ASIZE*(sizeof(pte_t))); // For mode = 2 void *bigbuf = mmap(NULL, ASIZE*4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS | MAP_POPULATE, -1, 0); if (bigbuf == MAP_FAILED) { printf ("mmap of %d MB fail\n", ASIZE*4096/1048576); } else { printf ("mmap of %d MB at %p\n", ASIZE*4096/1048576, bigbuf); } if (mlock (bigbuf, ASIZE*4096) != 0) { printf ("mlock of %d MB failed.\n", ASIZE*4096/1048576); } pte_t *pt_window = (pte_t*) mmap(NULL, num_pde_to_map*4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS | MAP_POPULATE, -1, 0); if (pt_window == MAP_FAILED) { printf ("mmap of pt_window array failed\n"); } else { printf ("mmap of pt_window at %p\n", pt_window); } if (mlock (pt_window, num_pde_to_map*4096) != 0) { printf ("mlock of pt_window failed.\n"); } // Sanity check: Do a translation for every page in bigbuf to check for hugepages before // we start tinkering with page table mappings. for (unsigned int i = 0; i < ASIZE; i++) { translate(cr3, (uintptr_t)bigbuf + (i<<12), 0); } // Back up 9 mappings used for the page table window and remap it to the page tables for bigbuf. for (int i=0;i> 12) & (pte_per_pde-1)) + 0]; // dont_remap causes the PTE update to write into unrelated memory. // The operations are the same. This tests for PTE coherence (whether writing into the PTE // behaves differently than writing into some other location in memory) volatile pte_t *p_tinkered_pte = opts.dont_remap ? scratch_memory : p_pte; // Back up page table that was exposed by pt_window for (unsigned int i=0;i=1;i--) { printf ("\n -> %016llx", (uint64_t)translate(cr3, (uintptr_t)bigbuf, i)); } printf (" pte\n"); pte_t map_pte1 = old_pagetable[0]; // Use the first two pages of bigbuf as real backing memory. pte_t map_pte2 = old_pagetable[1]; for (int i=0;i<1024;i++) { ((int*)(bigbuf))[i] = 1; ((int*)(bigbuf))[i+1024] = 2; } // Initialize array with random permutation of pages to visit. uint16_t *randindex = new uint16_t[ASIZE]; uint32_t *randindex2 = new uint32_t[ASIZE*2]; // struct of 2 elements for (unsigned int i=0;i 4096) inner_its = 4096; if (opts.mode == 0) { unsigned int lcounts[4] = {0}; // 32-bit: We're out of registers anyway. Accumulate results in a memory variable to avoid branches. start_time = rdtsc(); for (int j=0;j