// // Pointer chasing // // // Author: P.J. Drongowski // 2 June 2013 // // Copyright (c) 2013 Paul. J. Drongowski // #include <sys/types.h> #include <unistd.h> #include <stdlib.h> #include <stdio.h> #include "../test_common/rpi_pmu.h" #include "../test_common/test_common.h" #define RESULT_FILE_NAME "chase.txt" #define TEST_ITERATIONS 8192 // // Default values for the pointer array and the cache line size // #define ARRAY_ELEMENTS (16 * 1024) #define POINTER_SIZE (sizeof(void*)) #define LINE_SIZE 32 #define POINTERS_PER_LINE (LINE_SIZE / POINTER_SIZE) typedef struct _CacheLine { struct _CacheLine* ptrCacheLine[POINTERS_PER_LINE] ; } CacheLine ; long int array_elements = ARRAY_ELEMENTS ; long int line_size = LINE_SIZE ; long int pointer_size = POINTER_SIZE ; long int pointers_per_line = POINTERS_PER_LINE ; // // The program allocates an array of CacheLine structures. The number // of elements (cache lines) is determined by the variable array_elements. // The array is initialized such that it forms a linked list of elements. // Elements in the upper half of the array point to elements in the // lower half, and vice versa. Thus, access will ping-pong between // the upper and lower halves when the linked list is walked (pointers // are chased.) // CacheLine* array = NULL ; // // The program has two phases: initialization and pointer chasing. // The process-to-core affinity is set at the beginning of each phase. // Windows uses a first touch physical allocation scheme, so it's // possible to force physical allocation of the pointer array to a // particular core and to force chasing on the same or different core. // This should support NUMA experiments. // int initialization_core = 0 ; // Core (CPU) for array allocation/initialization int chase_core = 0 ; // Core (CPU) for pointer chasing // // Run options (some not implemented) // int a_flag = 0 ; // Set core for array allocation/initialization int c_flag = 0 ; // Measure L1 data cache events int e_flag = 0 ; // Measure explicit external data access events int i_flag = 0 ; // Measure instruction and cycle events int l_flag = 0 ; // Set cache line size int n_flag = 0 ; // Set the number of tests (iterattion count) int p_flag = 0 ; // Set core for pointer chasing int s_flag = 0 ; // Set array size (in units of 1024 elements) int t_flag = 0 ; // Measure TLB events #define OPTIONS "dhi:v" // Currently unimplemented (getopt) char *usage_strings[] = { " -a N Allocate/initialize array on core N", " -c Measure L1 data cache events", " -d Display debugging information", " -e Measure explicit external data access events", " -h Display usage information", " -i Measure instruction and cycle events", " -l N Set the cache line size to N bytes", " -n N Set the number of tests (iteration count)", " -p N Perform pointer chasing on core N", " -s N Allocate an array with N * 1024 elements", " -t Measure TLB events", " -v Display more information (verbose output)", NULL } ; // // Allocate and initialize the test array // void initialize_array() { long int steps ; // Number of initialization setps long int top ; // Index of top cache line long int bottom ; // Index of bottom cache line long int i ; // Array index counter int j ; // Pointer index counter if ((array = (CacheLine*) malloc(array_elements * line_size)) == NULL) { fprintf(stderr, "Couldn't allocate test array\n") ; exit( EXIT_FAILURE ) ; } // // Set the entire array to zero. // for (i = 0 ; i < array_elements ; i++) { for (j = 0 ; j < pointers_per_line ; j++) { array[i].ptrCacheLine[j] = NULL ; } } // // The test array is divided into two halves: top and // bottom. Set the pointers // so that memory accesses ping-pong between the two // halves of the array. Ping-pong between cache lines. // steps = array_elements / 2 ; bottom = 0 ; top = steps ; for (i = 0 ; i < steps ; i++) { array[bottom].ptrCacheLine[0] = &array[top] ; array[top].ptrCacheLine[0] = &array[bottom+1] ; top++ ; bottom++ ; } // // Terminate the linked list // array[(array_elements-1)].ptrCacheLine[0] = NULL ; if (d_flag) { for (i = 0 ; i < array_elements ; i++) { // fprintf(result_file, "%16" FMT64 "x\n", array[i].ptrCacheLine[0]) ; fprintf(result_file, "%d %8lx %8lx\n", i, &array[i].ptrCacheLine[0], array[i].ptrCacheLine[0]) ; } } } // // Examine the assembler code generated for test_loop_prototype // to guide implementation of the assembly language test code // void test_loop_prototype(CacheLine* array) { CacheLine* p ; for(p = array ; p != NULL ; p = p->ptrCacheLine[0]) ; } // // Perform the test itself. Since all of the test-related // activity starts in this function, it should be easier to // track samples back to the test itself. // void run_no_events(int iteration_count) { int i ; for (i = iteration_count ; i > 0 ; i--) { test_loop_prototype(array) ; } } void run_external(int iteration_count) { int i ; start_counting(ARMV6_EVENT_EXPL_D_ACCESS, ARMV6_EVENT_DDEP_STALL) ; for (i = iteration_count ; i > 0 ; i--) { test_loop_prototype(array) ; } stop_counting() ; } void run_ipc(int iteration_count) { int i ; start_counting(ARMV6_EVENT_INSTR_EXEC, ARMV6_EVENT_CPU_CYCLES) ; for (i = iteration_count ; i > 0 ; i--) { test_loop_prototype(array) ; } stop_counting() ; } void run_cache(int iteration_count) { int i ; start_counting(ARMV6_EVENT_DCACHE_CACCESS, ARMV6_EVENT_DCACHE_MISS) ; for (i = iteration_count ; i > 0 ; i--) { test_loop_prototype(array) ; } stop_counting() ; } void run_tlb(int iteration_count) { int i ; start_counting(ARMV6_EVENT_DTLB_MISS, ARMV6_EVENT_MAIN_TLB_MISS) ; for (i = iteration_count ; i > 0 ; i--) { test_loop_prototype(array) ; } stop_counting() ; } // // Handle command line arguments, bind the process to a // particular CPU, run the test and write the results to // the results file. // int main(int argc, char *argv[]) { int arg_count ; pid_t my_process_id ; float cpu_clock ; // // Variables to hold the run parameters. These could be // modified by command line arguments (if command line // parsing is ever implemented. :-) // int iteration_count = TEST_ITERATIONS ; // // Variables for measuring expired CPU cycles // unsigned long long int before ; // Perf counter at start of test unsigned long long int after ; // Oerf counter at end of test unsigned long long int total_cycles ; // Cycles consumed by test // // Variables for computing expected results // unsigned long long int total_retires ; // Total retire events unsigned long long int total_dtlb_misses ; // Total DTLB miss events unsigned long long int total_dc_accesses ; // Total DC access events unsigned long long int total_dc_misses ; // Total DC miss events unsigned long long int expected_cycle_samples ; // Expected cycle samples unsigned long long int expected_retire_samples ; // Expected ret samples unsigned long long int expected_dc_access_samples ; // Exp DC access samples unsigned long long int expected_dc_miss_samples ; // Exp DC miss samples unsigned long long int expected_dtlb_miss_samples ; // Exp DTLB miss samples #ifdef _DEBUG d_flag = 1 ; #endif // Parse any command line arguments. Use simple one character // option names preceded by a '-' character. for (arg_count = 1 ; arg_count < argc ; arg_count++) { if( argv[arg_count][0] == '-' ) { switch( argv[arg_count][1] ) { case 'a' : { // Set processor core for array allocation and initialization fprintf(stderr, "*warning* -a is UNIMPLEMENTED\n") ; a_flag = 1 ; arg_count++ ; if (arg_count < argc) { initialization_core = atoi(argv[arg_count]) ; } break ; } case 'c' : { // Measure L1 data cache (DC) events c_flag = 1 ; break ; } case 'd' : { // Enable debug output d_flag = 1 ; break ; } case 'e' : { // Measure explicit external data access events e_flag = 1 ; break ; } case 'h' : { // Display usage information display_usage_info(argv[0], usage_strings) ; h_flag = 1 ; break ; } case 'i' : { // Measure instructions per cycle (IPC) i_flag = 1 ; break ; } case 'l' : { // Set cache line size to N bytes l_flag = 1 ; arg_count++ ; if (arg_count < argc) { line_size = LINE_SIZE ; // line_size = atoi(argv[arg_count]) ; // pointers_per_line = (line_size / pointer_size) ; } break ; } case 'n' : { // Set iteration count (number of tests) n_flag = 1 ; arg_count++ ; if (arg_count < argc) { iteration_count = atoi(argv[arg_count]) ; } break ; } case 'p' : { // Set processor core for pointer chasing fprintf(stderr, "*warning* -p is UNIMPLEMENTED\n") ; p_flag = 1 ; arg_count++ ; if (arg_count < argc) { chase_core = atoi(argv[arg_count]) ; } break ; } case 's' : { // Set array size to N elements s_flag = 1 ; arg_count++ ; if (arg_count < argc) { long int elements = ARRAY_ELEMENTS ; elements = atoi(argv[arg_count]) ; array_elements = elements ; } break ; } case 't' : { // Measure translation lookaside buffer (TLB) events t_flag = 1 ; break ; } case 'v' : { v_flag = 1 ; break ; } default: { printf("Illegal option: %c\n", argv[arg_count][1]) ; break ; } } } } // Get ID of current process and CPU clock speed my_process_id = getpid() ; cpu_clock = 1000000.0f * (float)get_cpu_speed() ; if (create_result_file(RESULT_FILE_NAME) == 0) { exit( EXIT_FAILURE ) ; } // Allocate and initialize the pointer chasing array initialize_array() ; // Snap performance counter and TOD at start of test run before = getTimeStamp() ; // Perform pointer chasing if (c_flag) { run_cache(iteration_count) ; } else if (e_flag) { run_external(iteration_count) ; } else if (i_flag) { run_ipc(iteration_count) ; } else if (t_flag) { run_tlb(iteration_count) ; } else { run_no_events(iteration_count) ; } // Snap performance counter after the test loop after = getTimeStamp() ; total_cycles = after - before ; if (d_flag) { printf("*debug* Before: %16" FMT64 "d\n", before) ; printf("*debug* After: %16" FMT64 "d\n", after) ; } // Display estimated results print_heading("Pointer chasing test") ; print_system_info() ; fprintf(result_file,"Run information\n") ; fprintf(result_file," PID: %16d\n", my_process_id) ; fprintf(result_file," Allocation core: %16d\n", initialization_core) ; fprintf(result_file," Chase core: %16d\n", chase_core) ; fprintf(result_file," CPU clock speed (GHz): %16.4f\n", (float)(cpu_clock / 1000000000.0)) ; fprintf(result_file," Number of test iterations: %16d\n", iteration_count) ; fprintf(result_file," Array elements/cache lines: %16d\n", array_elements) ; fprintf(result_file," Array size/bytes: %16d\n", array_elements * line_size) ; fprintf(result_file," Cache line size: %16d\n", line_size) ; fprintf(result_file," Pointer size: %16d\n", pointer_size) ; fprintf(result_file," Pointers per line: %16d\n", pointers_per_line) ; if (c_flag || e_flag || i_flag || t_flag) { fprintf(result_file,"Performance Monitor events\n") ; print_counts(result_file) ; } fprintf(result_file, "Run execution summary\n") ; print_process_times() ; print_elapsed_time() ; close_result_file() ; return( EXIT_SUCCESS ) ; }