// // Memory access latency // // // Author: P.J. Drongowski // 8 June 2013 // // Copyright (c) 2013 Paul. J. Drongowski // #include <unistd.h> #include <stdlib.h> #include <stdio.h> #include "../test_common/test_common.h" #include "../test_common/rpi_pmu.h" #define RESULT_FILE_NAME "latency.txt" #define TEST_ITERATIONS 1000 #define MAX_TEST_ITERATIONS 2000 int iteration_count = TEST_ITERATIONS ; // // Store latency samples in this file. Take one sample // during each iteration of the test loop. // #define SAMPLE_FILE_NAME "samples.dat" uint32_t samples[MAX_TEST_ITERATIONS] ; // // Default values for the pointer array and the cache // line size. See chase.c for descriptions of these // data structures. // #define ARRAY_ELEMENTS (16 * 1024) #define POINTER_SIZE (sizeof(void*)) #define LINE_SIZE 32 #define POINTERS_PER_LINE (LINE_SIZE / POINTER_SIZE) typedef struct _CacheLine { struct _CacheLine* ptrCacheLine[POINTERS_PER_LINE] ; } CacheLine ; long int array_elements = ARRAY_ELEMENTS ; long int line_size = LINE_SIZE ; long int pointer_size = POINTER_SIZE ; long int pointers_per_line = POINTERS_PER_LINE ; CacheLine* array = NULL ; // // Command line options // // // Run options (some not implemented) // int l_flag = 0 ; // Set cache line size int n_flag = 0 ; // Set the number of tests (iteration count) int s_flag = 0 ; // Set array size (in units of 1024 elements) #define OPTIONS "dhl:n:s:v" // Currently unimplemented (getopt) char *usage_strings[] = { " -d Display debugging information", " -h Display usage information", " -l N Set the cache line size to N bytes", " -n N Set the number of tests (iteration count)", " -s N Allocate an array with N * 1024 elements", " -v Display more information (verbose output)", NULL } ; // // Allocate and initialize the test array // void initialize_array() { long int steps ; // Number of initialization setps long int top ; // Index of top cache line long int bottom ; // Index of bottom cache line long int i ; // Array index counter int j ; // Pointer index counter if ((array = (CacheLine*) malloc(array_elements * line_size)) == NULL) { fprintf(stderr, "Couldn't allocate test array\n") ; exit( EXIT_FAILURE ) ; } // // Set the entire array to zero. // for (i = 0 ; i < array_elements ; i++) { for (j = 0 ; j < pointers_per_line ; j++) { array[i].ptrCacheLine[j] = NULL ; } } // // The test array is divided into two halves: top and // bottom. Set the pointers // so that memory accesses ping-pong between the two // halves of the array. Ping-pong between cache lines. // steps = array_elements / 2 ; bottom = 0 ; top = steps ; for (i = 0 ; i < steps ; i++) { array[bottom].ptrCacheLine[0] = &array[top] ; array[top].ptrCacheLine[0] = &array[bottom+1] ; top++ ; bottom++ ; } // // Terminate the linked list // array[(array_elements-1)].ptrCacheLine[0] = NULL ; if (d_flag) { for (i = 0 ; i < array_elements ; i++) { // fprintf(result_file, "%16" FMT64 "x\n", array[i].ptrCacheLine[0]) ; fprintf(result_file, "%d %8lx %8lx\n", i, &array[i].ptrCacheLine[0], array[i].ptrCacheLine[0]) ; } } } static inline uint32_t armv6_read_ccr() { uint32_t value ; asm volatile("mrc p15, 0, %0, c15, c12, 1" : "=r"(value)) ; return( value ) ; } uint32_t latency_cycles(CacheLine* linked_list) { register CacheLine* item ; register uint32_t before, after, cycles ; cycles = 0 ; for(item = linked_list ; item != NULL ; ) { before = armv6_read_ccr() ; item = item->ptrCacheLine[0] ; after = armv6_read_ccr() ; cycles += after - before ; } return( cycles ) ; } uint32_t measure_latency_cycles() { uint32_t cycles = 0 ; // Clear the sticky overflow bits armv6_pmcr_write(ARMV6_PMCR_CCOUNT_OVERFLOW | ARMV6_PMCR_COUNT0_OVERFLOW | ARMV6_PMCR_COUNT1_OVERFLOW ) ; // Clear and start the performance counters armv6_pmcr_write(ARMV6_PMCR_ENABLE | ARMV6_PMCR_CCOUNT_RESET | ARMV6_PMCR_CTR01_RESET | (0xFF << ARMV6_PMCR_EVT_COUNT0_SHIFT) | (0xFF << ARMV6_PMCR_EVT_COUNT1_SHIFT) ) ; cycles = latency_cycles(array) ; // Stop the performance counters armv6_pmcr_write(0) ; return( cycles ) ; } uint32_t sample_cycles(CacheLine* linked_list, int n) { register CacheLine* item ; register uint32_t before, after, cycles ; register count = n ; cycles = 0 ; for(item = linked_list ; item != NULL ; ) { if (count-- == 0) { before = armv6_read_ccr() ; item = item->ptrCacheLine[0] ; after = armv6_read_ccr() ; cycles = after - before ; } else { item = item->ptrCacheLine[0] ; } } return( cycles ) ; } // // Run the pointer chasing loop multiple times and // take one latency sample per iteration. // // Compile with -O2 to inline the MCR/MRC instructions! // void sample_latency_cycles() { register int i ; // Clear the sticky overflow bits armv6_pmcr_write(ARMV6_PMCR_CCOUNT_OVERFLOW | ARMV6_PMCR_COUNT0_OVERFLOW | ARMV6_PMCR_COUNT1_OVERFLOW ) ; for(i = iteration_count-1 ; i >= 0 ; i--) { // Clear and start the performance counters armv6_pmcr_write(ARMV6_PMCR_ENABLE | ARMV6_PMCR_CCOUNT_RESET | ARMV6_PMCR_CTR01_RESET | (0xFF << ARMV6_PMCR_EVT_COUNT0_SHIFT) | (0xFF << ARMV6_PMCR_EVT_COUNT1_SHIFT) ) ; // Kind of a bogus "random" sampling within a 64 // iteration window during list traversal samples[i] = sample_cycles(array, 187+(i&0x3F) ) ; // Stop the performance counters armv6_pmcr_write(0) ; } } // // Write the number of samples and the latency // samples to a file. // void write_samples(char* filename) { int i ; FILE* sample_file = NULL ; if ((sample_file = fopen(filename, "w")) == NULL) { fprintf(stderr, "Couldn't create sample file\n") ; return ; } // The first line is the number of samples fprintf(sample_file, "%ld\n", iteration_count) ; for (i = 0 ; i < iteration_count ; i++) { fprintf(sample_file, "%ld\n", samples[i]) ; } fclose(sample_file) ; } int main(int argc, char* argv[]) { int arg_count ; uint32_t cycles = 0 ; double latency = 0.0 ; #ifdef _DEBUG d_flag = 1 ; #endif // Parse any command line arguments. Use simple one character // option names preceded by a '-' character. for (arg_count = 1 ; arg_count < argc ; arg_count++) { if( argv[arg_count][0] == '-' ) { switch( argv[arg_count][1] ) { case 'd' : { // Enable debug output d_flag = 1 ; break ; } case 'h' : { // Display usage information display_usage_info(argv[0], usage_strings) ; h_flag = 1 ; break ; } case 'l' : { // Set cache line size to N bytes l_flag = 1 ; arg_count++ ; if (arg_count < argc) { line_size = LINE_SIZE ; // line_size = atoi(argv[arg_count]) ; // pointers_per_line = (line_size / pointer_size) ; } break ; } case 'n' : { // Set iteration count (number of tests) n_flag = 1 ; arg_count++ ; if (arg_count < argc) { iteration_count = atoi(argv[arg_count]) ; if (iteration_count > MAX_TEST_ITERATIONS) { iteration_count = MAX_TEST_ITERATIONS ; } } break ; } case 's' : { // Set array size to N elements s_flag = 1 ; arg_count++ ; if (arg_count < argc) { long int elements = ARRAY_ELEMENTS ; elements = atoi(argv[arg_count]) ; if (elements < 256) elements = 256 ; array_elements = elements ; } break ; } case 'v' : { v_flag = 1 ; break ; } default: { printf("Illegal option: %c\n", argv[arg_count][1]) ; break ; } } } } // Create the result file if (create_result_file(RESULT_FILE_NAME) == 0) { exit( EXIT_FAILURE ) ; } // Allocate and initialize the linked list initialize_array() ; // Measure latency cycles to compute an average // The first call is needed to warm up the cache cycles = measure_latency_cycles() ; cycles = measure_latency_cycles() ; // Collect and write latency cycles to the // samples file sample_latency_cycles() ; write_samples(SAMPLE_FILE_NAME) ; // Display estimated results print_heading("Memory access latency test") ; print_system_info() ; fprintf(result_file,"Run information\n") ; fprintf(result_file," Number of test iterations: %16d\n", iteration_count) ; fprintf(result_file," List items/cache lines: %16d\n", array_elements) ; fprintf(result_file," Array size/bytes: %16d\n", array_elements * line_size) ; fprintf(result_file," Cache line size: %16d\n", line_size) ; fprintf(result_file," Pointer size: %16d\n", pointer_size) ; fprintf(result_file," Pointers per line: %16d\n", pointers_per_line) ; // Compute and print the average latency using the // total number of latency cycles. This is NOT the // most accurate method due to statistical outliers. // Subtract off a six cycle measurement bias. latency = ((double)cycles / (double)array_elements) - 6.0 ; fprintf(result_file, "Statistics\n") ; fprintf(result_file, " Cycles: %ld\n", cycles) ; fprintf(result_file, " Latency: %6.3f\n", latency) ; fprintf(result_file, "Run execution summary\n") ; print_process_times() ; print_elapsed_time() ; close_result_file() ; return( EXIT_SUCCESS ) ; }