//
// Pointer chasing
//
//
// Author: P.J. Drongowski
// 2 June 2013
//
// Copyright (c) 2013 Paul. J. Drongowski
//
#include <sys/types.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include "../test_common/rpi_pmu.h"
#include "../test_common/test_common.h"
#define RESULT_FILE_NAME "chase.txt"
#define TEST_ITERATIONS 8192
//
// Default values for the pointer array and the cache line size
//
#define ARRAY_ELEMENTS (16 * 1024)
#define POINTER_SIZE (sizeof(void*))
#define LINE_SIZE 32
#define POINTERS_PER_LINE (LINE_SIZE / POINTER_SIZE)
typedef struct _CacheLine
{
struct _CacheLine* ptrCacheLine[POINTERS_PER_LINE] ;
} CacheLine ;
long int array_elements = ARRAY_ELEMENTS ;
long int line_size = LINE_SIZE ;
long int pointer_size = POINTER_SIZE ;
long int pointers_per_line = POINTERS_PER_LINE ;
//
// The program allocates an array of CacheLine structures. The number
// of elements (cache lines) is determined by the variable array_elements.
// The array is initialized such that it forms a linked list of elements.
// Elements in the upper half of the array point to elements in the
// lower half, and vice versa. Thus, access will ping-pong between
// the upper and lower halves when the linked list is walked (pointers
// are chased.)
//
CacheLine* array = NULL ;
//
// The program has two phases: initialization and pointer chasing.
// The process-to-core affinity is set at the beginning of each phase.
// Windows uses a first touch physical allocation scheme, so it's
// possible to force physical allocation of the pointer array to a
// particular core and to force chasing on the same or different core.
// This should support NUMA experiments.
//
int initialization_core = 0 ; // Core (CPU) for array allocation/initialization
int chase_core = 0 ; // Core (CPU) for pointer chasing
//
// Run options (some not implemented)
//
int a_flag = 0 ; // Set core for array allocation/initialization
int c_flag = 0 ; // Measure L1 data cache events
int e_flag = 0 ; // Measure explicit external data access events
int i_flag = 0 ; // Measure instruction and cycle events
int l_flag = 0 ; // Set cache line size
int n_flag = 0 ; // Set the number of tests (iterattion count)
int p_flag = 0 ; // Set core for pointer chasing
int s_flag = 0 ; // Set array size (in units of 1024 elements)
int t_flag = 0 ; // Measure TLB events
#define OPTIONS "dhi:v" // Currently unimplemented (getopt)
char *usage_strings[] = {
" -a N Allocate/initialize array on core N",
" -c Measure L1 data cache events",
" -d Display debugging information",
" -e Measure explicit external data access events",
" -h Display usage information",
" -i Measure instruction and cycle events",
" -l N Set the cache line size to N bytes",
" -n N Set the number of tests (iteration count)",
" -p N Perform pointer chasing on core N",
" -s N Allocate an array with N * 1024 elements",
" -t Measure TLB events",
" -v Display more information (verbose output)",
NULL
} ;
//
// Allocate and initialize the test array
//
void initialize_array()
{
long int steps ; // Number of initialization setps
long int top ; // Index of top cache line
long int bottom ; // Index of bottom cache line
long int i ; // Array index counter
int j ; // Pointer index counter
if ((array = (CacheLine*) malloc(array_elements * line_size)) == NULL) {
fprintf(stderr, "Couldn't allocate test array\n") ;
exit( EXIT_FAILURE ) ;
}
//
// Set the entire array to zero.
//
for (i = 0 ; i < array_elements ; i++) {
for (j = 0 ; j < pointers_per_line ; j++) {
array[i].ptrCacheLine[j] = NULL ;
}
}
//
// The test array is divided into two halves: top and
// bottom. Set the pointers
// so that memory accesses ping-pong between the two
// halves of the array. Ping-pong between cache lines.
//
steps = array_elements / 2 ;
bottom = 0 ;
top = steps ;
for (i = 0 ; i < steps ; i++) {
array[bottom].ptrCacheLine[0] = &array[top] ;
array[top].ptrCacheLine[0] = &array[bottom+1] ;
top++ ;
bottom++ ;
}
//
// Terminate the linked list
//
array[(array_elements-1)].ptrCacheLine[0] = NULL ;
if (d_flag) {
for (i = 0 ; i < array_elements ; i++) {
// fprintf(result_file, "%16" FMT64 "x\n", array[i].ptrCacheLine[0]) ;
fprintf(result_file, "%d %8lx %8lx\n",
i, &array[i].ptrCacheLine[0], array[i].ptrCacheLine[0]) ;
}
}
}
//
// Examine the assembler code generated for test_loop_prototype
// to guide implementation of the assembly language test code
//
void test_loop_prototype(CacheLine* array)
{
CacheLine* p ;
for(p = array ; p != NULL ; p = p->ptrCacheLine[0]) ;
}
//
// Perform the test itself. Since all of the test-related
// activity starts in this function, it should be easier to
// track samples back to the test itself.
//
void run_no_events(int iteration_count)
{
int i ;
for (i = iteration_count ; i > 0 ; i--) {
test_loop_prototype(array) ;
}
}
void run_external(int iteration_count)
{
int i ;
start_counting(ARMV6_EVENT_EXPL_D_ACCESS, ARMV6_EVENT_DDEP_STALL) ;
for (i = iteration_count ; i > 0 ; i--) {
test_loop_prototype(array) ;
}
stop_counting() ;
}
void run_ipc(int iteration_count)
{
int i ;
start_counting(ARMV6_EVENT_INSTR_EXEC, ARMV6_EVENT_CPU_CYCLES) ;
for (i = iteration_count ; i > 0 ; i--) {
test_loop_prototype(array) ;
}
stop_counting() ;
}
void run_cache(int iteration_count)
{
int i ;
start_counting(ARMV6_EVENT_DCACHE_CACCESS, ARMV6_EVENT_DCACHE_MISS) ;
for (i = iteration_count ; i > 0 ; i--) {
test_loop_prototype(array) ;
}
stop_counting() ;
}
void run_tlb(int iteration_count)
{
int i ;
start_counting(ARMV6_EVENT_DTLB_MISS, ARMV6_EVENT_MAIN_TLB_MISS) ;
for (i = iteration_count ; i > 0 ; i--) {
test_loop_prototype(array) ;
}
stop_counting() ;
}
//
// Handle command line arguments, bind the process to a
// particular CPU, run the test and write the results to
// the results file.
//
int main(int argc, char *argv[])
{
int arg_count ;
pid_t my_process_id ;
float cpu_clock ;
//
// Variables to hold the run parameters. These could be
// modified by command line arguments (if command line
// parsing is ever implemented. :-)
//
int iteration_count = TEST_ITERATIONS ;
//
// Variables for measuring expired CPU cycles
//
unsigned long long int before ; // Perf counter at start of test
unsigned long long int after ; // Oerf counter at end of test
unsigned long long int total_cycles ; // Cycles consumed by test
//
// Variables for computing expected results
//
unsigned long long int total_retires ; // Total retire events
unsigned long long int total_dtlb_misses ; // Total DTLB miss events
unsigned long long int total_dc_accesses ; // Total DC access events
unsigned long long int total_dc_misses ; // Total DC miss events
unsigned long long int expected_cycle_samples ; // Expected cycle samples
unsigned long long int expected_retire_samples ; // Expected ret samples
unsigned long long int expected_dc_access_samples ; // Exp DC access samples
unsigned long long int expected_dc_miss_samples ; // Exp DC miss samples
unsigned long long int expected_dtlb_miss_samples ; // Exp DTLB miss samples
#ifdef _DEBUG
d_flag = 1 ;
#endif
// Parse any command line arguments. Use simple one character
// option names preceded by a '-' character.
for (arg_count = 1 ; arg_count < argc ; arg_count++)
{
if( argv[arg_count][0] == '-' )
{
switch( argv[arg_count][1] )
{
case 'a' :
{
// Set processor core for array allocation and initialization
fprintf(stderr, "*warning* -a is UNIMPLEMENTED\n") ;
a_flag = 1 ;
arg_count++ ;
if (arg_count < argc)
{
initialization_core = atoi(argv[arg_count]) ;
}
break ;
}
case 'c' :
{
// Measure L1 data cache (DC) events
c_flag = 1 ;
break ;
}
case 'd' :
{
// Enable debug output
d_flag = 1 ;
break ;
}
case 'e' :
{
// Measure explicit external data access events
e_flag = 1 ;
break ;
}
case 'h' :
{
// Display usage information
display_usage_info(argv[0], usage_strings) ;
h_flag = 1 ;
break ;
}
case 'i' :
{
// Measure instructions per cycle (IPC)
i_flag = 1 ;
break ;
}
case 'l' :
{
// Set cache line size to N bytes
l_flag = 1 ;
arg_count++ ;
if (arg_count < argc)
{
line_size = LINE_SIZE ;
// line_size = atoi(argv[arg_count]) ;
// pointers_per_line = (line_size / pointer_size) ;
}
break ;
}
case 'n' :
{
// Set iteration count (number of tests)
n_flag = 1 ;
arg_count++ ;
if (arg_count < argc)
{
iteration_count = atoi(argv[arg_count]) ;
}
break ;
}
case 'p' :
{
// Set processor core for pointer chasing
fprintf(stderr, "*warning* -p is UNIMPLEMENTED\n") ;
p_flag = 1 ;
arg_count++ ;
if (arg_count < argc)
{
chase_core = atoi(argv[arg_count]) ;
}
break ;
}
case 's' :
{
// Set array size to N elements
s_flag = 1 ;
arg_count++ ;
if (arg_count < argc)
{
long int elements = ARRAY_ELEMENTS ;
elements = atoi(argv[arg_count]) ;
array_elements = elements ;
}
break ;
}
case 't' :
{
// Measure translation lookaside buffer (TLB) events
t_flag = 1 ;
break ;
}
case 'v' :
{
v_flag = 1 ;
break ;
}
default:
{
printf("Illegal option: %c\n", argv[arg_count][1]) ;
break ;
}
}
}
}
// Get ID of current process and CPU clock speed
my_process_id = getpid() ;
cpu_clock = 1000000.0f * (float)get_cpu_speed() ;
if (create_result_file(RESULT_FILE_NAME) == 0) {
exit( EXIT_FAILURE ) ;
}
// Allocate and initialize the pointer chasing array
initialize_array() ;
// Snap performance counter and TOD at start of test run
before = getTimeStamp() ;
// Perform pointer chasing
if (c_flag) {
run_cache(iteration_count) ;
} else if (e_flag) {
run_external(iteration_count) ;
} else if (i_flag) {
run_ipc(iteration_count) ;
} else if (t_flag) {
run_tlb(iteration_count) ;
} else {
run_no_events(iteration_count) ;
}
// Snap performance counter after the test loop
after = getTimeStamp() ;
total_cycles = after - before ;
if (d_flag) {
printf("*debug* Before: %16" FMT64 "d\n", before) ;
printf("*debug* After: %16" FMT64 "d\n", after) ;
}
// Display estimated results
print_heading("Pointer chasing test") ;
print_system_info() ;
fprintf(result_file,"Run information\n") ;
fprintf(result_file," PID: %16d\n", my_process_id) ;
fprintf(result_file," Allocation core: %16d\n", initialization_core) ;
fprintf(result_file," Chase core: %16d\n", chase_core) ;
fprintf(result_file," CPU clock speed (GHz): %16.4f\n", (float)(cpu_clock / 1000000000.0)) ;
fprintf(result_file," Number of test iterations: %16d\n", iteration_count) ;
fprintf(result_file," Array elements/cache lines: %16d\n", array_elements) ;
fprintf(result_file," Array size/bytes: %16d\n", array_elements * line_size) ;
fprintf(result_file," Cache line size: %16d\n", line_size) ;
fprintf(result_file," Pointer size: %16d\n", pointer_size) ;
fprintf(result_file," Pointers per line: %16d\n", pointers_per_line) ;
if (c_flag || e_flag || i_flag || t_flag) {
fprintf(result_file,"Performance Monitor events\n") ;
print_counts(result_file) ;
}
fprintf(result_file, "Run execution summary\n") ;
print_process_times() ;
print_elapsed_time() ;
close_result_file() ;
return( EXIT_SUCCESS ) ;
}