/* Henry Wong Microbenchmarking Return Address Branch Prediction http://blog.stuffedcow.net/2018/04/ras-microbenchmarks/ 2018-03-31 Test to probe for a return address stack branch predictor (Also known by various other names: RAS, RSB, return stack) This program is used to observe how calls beahve when executing speculatively (e.g., limits on the number of in-flight calls). Observations are made using hardware performance counters while running the program. There is no output. For example, a count of speculatively-executed call instructions might be interesting: perf stat -e br_inst_exec.all_direct_near_call As written, it only works in 32-bit x86 mode. On a 64-bit machine, compile with "gcc -O2 -m32 spec.c -o spec". AMD K8/K10: Do not place ret instructions at even addresses that are not divisible by 16. There are no branch indicator bits there. Also, do not have more than three branches in any aligned 16-byte block. Maximum calls in flight, preceded by 1 return (num calls - 1) IVB: 15 SNB: 15 HSW: 14 NHM: 15 P6: 13? Max calls if preceded by two returns (num calls - 2) P6: 1 NHM: 1 SNB: 15 IVB: 15 HSW: 14 Max calls if preceded by three returns (num calls - 3 non-speculative calls per iteration) P6: 1 NHM: 1 IVB: 15 SNB: 15 HSW: 14 Maximum branches (jmp only, including the return) IVB, SNB, HSW: 48 NHM: 122 using aam, 123 using offset adds, 92 using imul P6: 40 Maximum branches in flight (sum of call+jmp) IVB: 47 SNB: 47 HSW: 47 NHM: ~63? (13+50) AMD has a perf counter for return stack overflows Bulldozer: RETIRED_BRANCH_INSTRUCTIONS counter is not accurate. It increases with delay?? - Can't measure with perfcount: No speculative executed counters. */ #include #include #include // The body of the test function. Runs 16K copies of FRAG in a partially-unrolled loop. // Don't use eax as the loop counter because one of the FRAGs (f_jmp) clobbers eax. #define L1 (16*4) #define L2 (512*4) #define COMMA , #define T_BODY(FRAG) T_BODY_IC(FRAG, , "eax" COMMA "edx") #define T_BODY_IC(FRAG,INPUTS,CLOBBER) do { for(int i=262144; --i;) { \ asm( \ FRAG \ FRAG \ FRAG \ FRAG \ FRAG \ FRAG \ FRAG \ FRAG \ FRAG \ FRAG \ FRAG \ FRAG \ FRAG \ FRAG \ FRAG \ FRAG \ : :INPUTS : CLOBBER); \ } } while(0) #define STR_(x) #x #define STR(x) STR_(x) #define STR4(x) STR(4*(x)) #define REPH(N) REP_##N #define REP(N,X) REPH(N)(X) #define REP_0(X) #define REP_1(X) X #define REP_2(X) X X #define REP_3(X) X X X #define REP_4(X) X X X X #define REP_5(X) X X X X X #define REP_6(X) X X X X X X #define REP_7(X) X X X X X X X #define REP_8(X) X X X X X X X X #define REP_9(X) REP_4(X) REP_5(X) #define REP_10(X) REP_5(X) REP_5(X) #define REP_11(X) REP_5(X) REP_6(X) #define REP_12(X) REP_6(X) REP_6(X) #define REP_13(X) REP_6(X) REP_7(X) #define REP_14(X) REP_7(X) REP_7(X) #define REP_15(X) REP_7(X) REP_8(X) #define REP_16(X) REP_8(X) REP_8(X) #define REP_17(X) REP_8(X) REP_9(X) #define REP_18(X) REP_9(X) REP_9(X) #define REP_19(X) REP_9(X) REP_10(X) #define REP_20(X) REP_10(X) REP_10(X) #define REP_21(X) REP_10(X) REP_11(X) #define REP_22(X) REP_11(X) REP_11(X) #define REP_23(X) REP_11(X) REP_12(X) #define REP_24(X) REP_12(X) REP_12(X) #define REP_25(X) REP_12(X) REP_13(X) #define REP_26(X) REP_13(X) REP_13(X) #define REP_27(X) REP_13(X) REP_14(X) #define REP_28(X) REP_14(X) REP_14(X) #define REP_29(X) REP_14(X) REP_15(X) #define REP_30(X) REP_15(X) REP_15(X) #define REP_31(X) REP_15(X) REP_16(X) #define REP_32(X) REP_16(X) REP_16(X) #define REP_33(X) REP_16(X) REP_17(X) #define REP_34(X) REP_17(X) REP_17(X) #define REP_35(X) REP_17(X) REP_18(X) #define REP_36(X) REP_18(X) REP_18(X) #define REP_37(X) REP_18(X) REP_19(X) #define REP_38(X) REP_19(X) REP_19(X) #define REP_39(X) REP_19(X) REP_20(X) #define REP_40(X) REP_20(X) REP_20(X) #define REP_41(X) REP_20(X) REP_21(X) #define REP_42(X) REP_21(X) REP_21(X) #define REP_43(X) REP_21(X) REP_22(X) #define REP_44(X) REP_22(X) REP_22(X) #define REP_45(X) REP_22(X) REP_23(X) #define REP_46(X) REP_23(X) REP_23(X) #define REP_47(X) REP_23(X) REP_24(X) #define REP_48(X) REP_24(X) REP_24(X) #define REP_49(X) REP_24(X) REP_25(X) #define REP_50(X) REP_25(X) REP_25(X) #define REP_51(X) REP_25(X) REP_26(X) #define REP_52(X) REP_26(X) REP_26(X) #define REP_53(X) REP_26(X) REP_27(X) #define REP_54(X) REP_27(X) REP_27(X) #define REP_55(X) REP_27(X) REP_28(X) #define REP_56(X) REP_28(X) REP_28(X) #define REP_57(X) REP_28(X) REP_29(X) #define REP_58(X) REP_29(X) REP_29(X) #define REP_59(X) REP_29(X) REP_30(X) #define REP_60(X) REP_30(X) REP_30(X) #define REP_61(X) REP_30(X) REP_31(X) #define REP_62(X) REP_31(X) REP_31(X) #define REP_63(X) REP_31(X) REP_32(X) #define REP_64(X) REP_32(X) REP_32(X) #define REP_64(X) REP_32(X) REP_32(X) #define REP_65(X) REP_32(X) REP_33(X) #define REP_66(X) REP_33(X) REP_33(X) #define REP_67(X) REP_33(X) REP_34(X) #define REP_68(X) REP_34(X) REP_34(X) #define REP_69(X) REP_34(X) REP_35(X) #define REP_70(X) REP_35(X) REP_35(X) #define REP_71(X) REP_35(X) REP_36(X) #define REP_72(X) REP_36(X) REP_36(X) #define REP_73(X) REP_36(X) REP_37(X) #define REP_74(X) REP_37(X) REP_37(X) #define REP_75(X) REP_37(X) REP_38(X) #define REP_76(X) REP_38(X) REP_38(X) #define REP_77(X) REP_38(X) REP_39(X) #define REP_78(X) REP_39(X) REP_39(X) #define REP_79(X) REP_39(X) REP_40(X) #define REP_80(X) REP_40(X) REP_40(X) #define REP_81(X) REP_40(X) REP_41(X) #define REP_82(X) REP_41(X) REP_41(X) #define REP_83(X) REP_41(X) REP_42(X) #define REP_84(X) REP_42(X) REP_42(X) #define REP_85(X) REP_42(X) REP_43(X) #define REP_86(X) REP_43(X) REP_43(X) #define REP_87(X) REP_43(X) REP_44(X) #define REP_88(X) REP_44(X) REP_44(X) #define REP_89(X) REP_44(X) REP_45(X) #define REP_90(X) REP_45(X) REP_45(X) #define REP_91(X) REP_45(X) REP_46(X) #define REP_92(X) REP_46(X) REP_46(X) #define REP_93(X) REP_46(X) REP_47(X) #define REP_94(X) REP_47(X) REP_47(X) #define REP_95(X) REP_47(X) REP_48(X) #define REP_96(X) REP_48(X) REP_48(X) #define REP_97(X) REP_48(X) REP_49(X) #define REP_98(X) REP_49(X) REP_49(X) #define REP_99(X) REP_49(X) REP_50(X) #define REP_100(X) REP_50(X) REP_50(X) #define REP_101(X) REP_50(X) REP_51(X) #define REP_102(X) REP_51(X) REP_51(X) #define REP_103(X) REP_51(X) REP_52(X) #define REP_104(X) REP_52(X) REP_52(X) #define REP_105(X) REP_52(X) REP_53(X) #define REP_106(X) REP_53(X) REP_53(X) #define REP_107(X) REP_53(X) REP_54(X) #define REP_108(X) REP_54(X) REP_54(X) #define REP_109(X) REP_54(X) REP_55(X) #define REP_110(X) REP_55(X) REP_55(X) #define REP_111(X) REP_55(X) REP_56(X) #define REP_112(X) REP_56(X) REP_56(X) #define REP_113(X) REP_56(X) REP_57(X) #define REP_114(X) REP_57(X) REP_57(X) #define REP_115(X) REP_57(X) REP_58(X) #define REP_116(X) REP_58(X) REP_58(X) #define REP_117(X) REP_58(X) REP_59(X) #define REP_118(X) REP_59(X) REP_59(X) #define REP_119(X) REP_59(X) REP_60(X) #define REP_120(X) REP_60(X) REP_60(X) #define REP_121(X) REP_60(X) REP_61(X) #define REP_122(X) REP_61(X) REP_61(X) #define REP_123(X) REP_61(X) REP_62(X) #define REP_124(X) REP_62(X) REP_62(X) #define REP_125(X) REP_62(X) REP_63(X) #define REP_126(X) REP_63(X) REP_63(X) #define REP_127(X) REP_63(X) REP_64(X) #define REP_128(X) REP_64(X) REP_64(X) #define REP_129(X) REP_64(X) REP_65(X) #define REP_130(X) REP_65(X) REP_65(X) #define REP_131(X) REP_65(X) REP_66(X) #define REP_132(X) REP_66(X) REP_66(X) #define REP_133(X) REP_66(X) REP_67(X) #define REP_134(X) REP_67(X) REP_67(X) #define REP_135(X) REP_67(X) REP_68(X) #define REP_136(X) REP_68(X) REP_68(X) #define REP_137(X) REP_68(X) REP_69(X) #define REP_138(X) REP_69(X) REP_69(X) #define REP_139(X) REP_69(X) REP_70(X) #define REP_140(X) REP_70(X) REP_70(X) #define REP_141(X) REP_70(X) REP_71(X) #define REP_142(X) REP_71(X) REP_71(X) #define REP_143(X) REP_71(X) REP_72(X) #define REP_144(X) REP_72(X) REP_72(X) #define REP_145(X) REP_72(X) REP_73(X) #define REP_146(X) REP_73(X) REP_73(X) #define REP_147(X) REP_73(X) REP_74(X) #define REP_148(X) REP_74(X) REP_74(X) #define REP_149(X) REP_74(X) REP_75(X) #define REP_150(X) REP_75(X) REP_75(X) #define REP_151(X) REP_75(X) REP_76(X) #define REP_152(X) REP_76(X) REP_76(X) #define REP_153(X) REP_76(X) REP_77(X) #define REP_154(X) REP_77(X) REP_77(X) #define REP_155(X) REP_77(X) REP_78(X) #define REP_156(X) REP_78(X) REP_78(X) #define REP_157(X) REP_78(X) REP_79(X) #define REP_158(X) REP_79(X) REP_79(X) #define REP_159(X) REP_79(X) REP_80(X) #define REP_160(X) REP_80(X) REP_80(X) #define REP_161(X) REP_80(X) REP_81(X) #define REP_162(X) REP_81(X) REP_81(X) #define REP_163(X) REP_81(X) REP_82(X) #define REP_164(X) REP_82(X) REP_82(X) #define REP_165(X) REP_82(X) REP_83(X) #define REP_166(X) REP_83(X) REP_83(X) #define REP_167(X) REP_83(X) REP_84(X) #define REP_168(X) REP_84(X) REP_84(X) #define REP_169(X) REP_84(X) REP_85(X) #define REP_170(X) REP_85(X) REP_85(X) #define REP_171(X) REP_85(X) REP_86(X) #define REP_172(X) REP_86(X) REP_86(X) #define REP_173(X) REP_86(X) REP_87(X) #define REP_174(X) REP_87(X) REP_87(X) #define REP_175(X) REP_87(X) REP_88(X) #define REP_176(X) REP_88(X) REP_88(X) #define REP_177(X) REP_88(X) REP_89(X) #define REP_178(X) REP_89(X) REP_89(X) #define REP_179(X) REP_89(X) REP_90(X) #define REP_180(X) REP_90(X) REP_90(X) #define REP_181(X) REP_90(X) REP_91(X) #define REP_182(X) REP_91(X) REP_91(X) #define REP_183(X) REP_91(X) REP_92(X) #define REP_184(X) REP_92(X) REP_92(X) #define REP_185(X) REP_92(X) REP_93(X) #define REP_186(X) REP_93(X) REP_93(X) #define REP_187(X) REP_93(X) REP_94(X) #define REP_188(X) REP_94(X) REP_94(X) #define REP_189(X) REP_94(X) REP_95(X) #define REP_190(X) REP_95(X) REP_95(X) #define REP_191(X) REP_95(X) REP_96(X) #define REP_192(X) REP_96(X) REP_96(X) #define REP_193(X) REP_96(X) REP_97(X) #define REP_194(X) REP_97(X) REP_97(X) #define REP_195(X) REP_97(X) REP_98(X) #define REP_196(X) REP_98(X) REP_98(X) #define REP_197(X) REP_98(X) REP_99(X) #define REP_198(X) REP_99(X) REP_99(X) #define REP_199(X) REP_99(X) REP_100(X) #define REP_200(X) REP_100(X) REP_100(X) #define REP_201(X) REP_100(X) REP_101(X) #define REP_202(X) REP_101(X) REP_101(X) #define REP_203(X) REP_101(X) REP_102(X) #define REP_204(X) REP_102(X) REP_102(X) #define REP_205(X) REP_102(X) REP_103(X) #define REP_206(X) REP_103(X) REP_103(X) #define REP_207(X) REP_103(X) REP_104(X) #define REP_208(X) REP_104(X) REP_104(X) #define REP_209(X) REP_104(X) REP_105(X) #define REP_210(X) REP_105(X) REP_105(X) #define REP_211(X) REP_105(X) REP_106(X) #define REP_212(X) REP_106(X) REP_106(X) #define REP_213(X) REP_106(X) REP_107(X) #define REP_214(X) REP_107(X) REP_107(X) #define REP_215(X) REP_107(X) REP_108(X) #define REP_216(X) REP_108(X) REP_108(X) #define REP_217(X) REP_108(X) REP_109(X) #define REP_218(X) REP_109(X) REP_109(X) #define REP_219(X) REP_109(X) REP_110(X) #define REP_220(X) REP_110(X) REP_110(X) #define REP_221(X) REP_110(X) REP_111(X) #define REP_222(X) REP_111(X) REP_111(X) #define REP_223(X) REP_111(X) REP_112(X) #define REP_224(X) REP_112(X) REP_112(X) #define REP_225(X) REP_112(X) REP_113(X) #define REP_226(X) REP_113(X) REP_113(X) #define REP_227(X) REP_113(X) REP_114(X) #define REP_228(X) REP_114(X) REP_114(X) #define REP_229(X) REP_114(X) REP_115(X) #define REP_230(X) REP_115(X) REP_115(X) #define REP_231(X) REP_115(X) REP_116(X) #define REP_232(X) REP_116(X) REP_116(X) #define REP_233(X) REP_116(X) REP_117(X) #define REP_234(X) REP_117(X) REP_117(X) #define REP_235(X) REP_117(X) REP_118(X) #define REP_236(X) REP_118(X) REP_118(X) #define REP_237(X) REP_118(X) REP_119(X) #define REP_238(X) REP_119(X) REP_119(X) #define REP_239(X) REP_119(X) REP_120(X) #define REP_240(X) REP_120(X) REP_120(X) #define REP_241(X) REP_120(X) REP_121(X) #define REP_242(X) REP_121(X) REP_121(X) #define REP_243(X) REP_121(X) REP_122(X) #define REP_244(X) REP_122(X) REP_122(X) #define REP_245(X) REP_122(X) REP_123(X) #define REP_246(X) REP_123(X) REP_123(X) #define REP_247(X) REP_123(X) REP_124(X) #define REP_248(X) REP_124(X) REP_124(X) #define REP_249(X) REP_124(X) REP_125(X) #define REP_250(X) REP_125(X) REP_125(X) #define REP_251(X) REP_125(X) REP_126(X) #define REP_252(X) REP_126(X) REP_126(X) #define REP_253(X) REP_126(X) REP_127(X) #define REP_254(X) REP_127(X) REP_127(X) #define REP_255(X) REP_127(X) REP_128(X) #define REP_256(X) REP_128(X) REP_128(X) #define REP_257(X) REP_128(X) REP_129(X) #define REP_258(X) REP_129(X) REP_129(X) #define REP_259(X) REP_129(X) REP_130(X) #ifndef DELAYAMT #define DELAYAMT 8 #endif asm(R"( .align 4096; delay_and_return4k: mov (%esp), %eax )" #define REPH2(X) REPH(X) //REPH2(DELAYAMT) ("aam\n") // ascii adjust ax after multiply: A slow instruction used as a delay. //REPH2(DELAYAMT) ("imul $-1, %eax\n") // imul used as a delay. REPH2(DELAYAMT) ("add $0, 2(%esp); add $0, 1(%esp)\n") // dependent chain of partial-overlap store to load forwards as delay. Doesn't work if CPU supports partial-overlap store-to-load forwards. R"( and $0x10, %eax and $0x1, %eax # eax is now zero add $4091, %eax add %eax, (%esp) ret .align 4096; delay_and_return: mov (%esp), %eax # delay must be data-dependent on the call instruction (return address). )" #define REPH2(X) REPH(X) //REPH2(DELAYAMT) ("aam\n") //REPH2(DELAYAMT) ("imul $-1, %eax\n") REPH2(DELAYAMT) ("add $0, 2(%esp); add $0, 1(%esp)\n") R"( and $0x10, %eax and $0x1, %eax # eax is now zero add $11, %eax # Return to 16 bytes beyond the call (cause a RAS return misprediction). Use the 11 bytes to do things speculatively and observe what happens. add %eax, (%esp) ret )"); asm( R"( .align 4096 pf: jmp 99f many_calls: call many_calls jmp_call: jmp 1f nop 1: call jmp_call jmp_jmp_call: jmp 1f nop 1: jmp 1f nop 1: call jmp_jmp_call jmp4_call: 1: jmp 1f; nop; 1: jmp 1f; nop; 1: jmp 1f; nop; 1: jmp 1f; nop; nop 1: call jmp4_call jmp8_call: 1: jmp 1f; nop; 1: jmp 1f; nop; 1: jmp 1f; nop; 1: jmp 1f; nop; 1: jmp 1f; nop; 1: jmp 1f; nop; 1: jmp 1f; nop; 1: jmp 1f; nop; nop 1: call jmp8_call 99: ret )"); void spec_calls() { T_BODY(".align 16; call delay_and_return\n" "call many_calls\n" ".align 16\n"); } void spec_jmp() { T_BODY(".align 16; call delay_and_return\n" "1: jmp 1b\n" ".align 16\n"); } void spec_jmp_call() { T_BODY(".align 16; call delay_and_return\n" "1: call jmp_call\n" ".align 16\n"); } void spec_jmp_jmp_call() { T_BODY(".align 16; call delay_and_return\n" "1: call jmp_jmp_call\n" ".align 16\n"); } void spec_jmp4_call() { T_BODY(".align 16; call delay_and_return\n" "1: call jmp4_call\n" ".align 16\n"); } void spec_jmp8_call() { T_BODY(".align 16; call delay_and_return\n" "1: call jmp8_call\n" ".align 16\n"); } void spec_nop() { T_BODY(".align 4096; call delay_and_return4k\n" ".fill 4091,1,0x90\n" ".align 4096\n"); } void spec_ret1_call() { T_BODY(".align 16; call 1f\n" "call many_calls\n" ".align 16; 1: call delay_and_return\n" "ret\n" ".align 16\n" "add $4, %%esp\n"); } void spec_ret2_call() { T_BODY(".align 16; call 1f\n" "call many_calls\n" ".align 16;1: call 1f\n" "ret\n" ".align 16; 1: call delay_and_return\n" "ret\n" ".align 16\n" "add $8, %%esp\n"); } void check_32bit() { if (sizeof(void*) != 4) { // Because I made assumptions about pointers being 32 bits (in the inline assembly) and instruction encodings. // It could be ported to 64 bit without much work. printf ("This program only works in 32-bit mode\n"); exit(1); } } int main(int argc, char* argv[]) { check_32bit(); int test = -1; if (argc >= 2) sscanf(argv[1], "%u", &test); asm("call pf\n"); // Page fault in the test code so we can convince the CPU to speculatively execute it later. switch (test) { case 0: spec_calls(); break; case 1: spec_jmp(); break; case 2: spec_jmp_call(); break; case 3: spec_jmp_jmp_call(); break; case 4: spec_jmp4_call(); break; case 5: spec_jmp8_call(); break; case 6: spec_nop(); break; case 7: spec_ret1_call(); // See if the limit on calls also applies to returns. break; case 8: spec_ret2_call(); // See if the limit on calls also applies to returns. break; default: printf ("Unknown test %d\n", test); break; } return 0; }