chase.c

//
// Pointer chasing
//

//
// Author: P.J. Drongowski
// 2 June 2013
//
// Copyright (c) 2013 Paul. J. Drongowski
//

#include <sys/types.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>

#include "../test_common/rpi_pmu.h"
#include "../test_common/test_common.h"

#define RESULT_FILE_NAME  "chase.txt"

#define TEST_ITERATIONS   8192

//
// Default values for the pointer array and the cache line size
//
#define ARRAY_ELEMENTS     (16 * 1024)
#define POINTER_SIZE       (sizeof(void*))
#define LINE_SIZE          32
#define POINTERS_PER_LINE  (LINE_SIZE / POINTER_SIZE)

typedef struct _CacheLine
{
	struct _CacheLine* ptrCacheLine[POINTERS_PER_LINE] ;
} CacheLine ;

long int array_elements = ARRAY_ELEMENTS ;
long int line_size = LINE_SIZE ;
long int pointer_size = POINTER_SIZE ;
long int pointers_per_line = POINTERS_PER_LINE ;

//
// The program allocates an array of CacheLine structures. The number
// of elements (cache lines) is determined by the variable array_elements.
// The array is initialized such that it forms a linked list of elements.
// Elements in the upper half of the array point to elements in the
// lower half, and vice versa. Thus, access will ping-pong between
// the upper and lower halves when the linked list is walked (pointers
// are chased.)
//
CacheLine* array = NULL ;

//
// The program has two phases: initialization and pointer chasing.
// The process-to-core affinity is set at the beginning of each phase.
// Windows uses a first touch physical allocation scheme, so it's
// possible to force physical allocation of the pointer array to a
// particular core and to force chasing on the same or different core.
// This should support NUMA experiments.
//
int initialization_core = 0 ;  // Core (CPU) for array allocation/initialization
int chase_core = 0 ;           // Core (CPU) for pointer chasing

//
// Run options (some not implemented)
//
int a_flag = 0 ;    // Set core for array allocation/initialization
int c_flag = 0 ;    // Measure L1 data cache events
int e_flag = 0 ;    // Measure explicit external data access events
int i_flag = 0 ;    // Measure instruction and cycle events
int l_flag = 0 ;    // Set cache line size
int n_flag = 0 ;    // Set the number of tests (iterattion count)
int p_flag = 0 ;    // Set core for pointer chasing
int s_flag = 0 ;    // Set array size (in units of 1024 elements)
int t_flag = 0 ;    // Measure TLB events

#define OPTIONS "dhi:v"  // Currently unimplemented (getopt)

char *usage_strings[] = {
  "  -a N  Allocate/initialize array on core N",
  "  -c    Measure L1 data cache events",
  "  -d    Display debugging information",
  "  -e    Measure explicit external data access events",
  "  -h    Display usage information",
  "  -i    Measure instruction and cycle events",
  "  -l N  Set the cache line size to N bytes",
  "  -n N  Set the number of tests (iteration count)",
  "  -p N  Perform pointer chasing on core N",
  "  -s N  Allocate an array with N * 1024 elements",
  "  -t    Measure TLB events",
  "  -v    Display more information (verbose output)",
  NULL
} ;

//
// Allocate and initialize the test array
//
void initialize_array()
{
  long int steps ;   // Number of initialization setps
  long int top ;     // Index of top cache line
  long int bottom ;  // Index of bottom cache line
  long int i ;       // Array index counter
  int j ;            // Pointer index counter

  if ((array = (CacheLine*) malloc(array_elements * line_size)) == NULL) {
    fprintf(stderr, "Couldn't allocate test array\n") ;
    exit( EXIT_FAILURE ) ;
  }

  //
  // Set the entire array to zero.
  //
  for (i = 0 ; i < array_elements ; i++) {
    for (j = 0 ; j < pointers_per_line ; j++) {
      array[i].ptrCacheLine[j] = NULL ;
    }
  }

  //
  // The test array is divided into two halves: top and
  // bottom. Set the pointers
  // so that memory accesses ping-pong between the two
  // halves of the array. Ping-pong between cache lines.
  //
  steps = array_elements / 2 ;
  bottom = 0 ;
  top = steps ;
  for (i = 0 ; i < steps ; i++) {
    array[bottom].ptrCacheLine[0] = &array[top] ;
    array[top].ptrCacheLine[0] = &array[bottom+1] ;
    top++ ;
    bottom++ ;
  }

  //
  // Terminate the linked list
  //
  array[(array_elements-1)].ptrCacheLine[0] = NULL ;

  if (d_flag) {
    for (i = 0 ; i < array_elements ; i++) {
      // fprintf(result_file, "%16" FMT64 "x\n", array[i].ptrCacheLine[0]) ;
      fprintf(result_file, "%d  %8lx  %8lx\n",
	      i, &array[i].ptrCacheLine[0], array[i].ptrCacheLine[0]) ;
    }
  }
}


//
// Examine the assembler code generated for test_loop_prototype
// to guide implementation of the assembly language test code
//
void test_loop_prototype(CacheLine* array)
{
  CacheLine* p ;
  for(p = array ; p != NULL ; p = p->ptrCacheLine[0]) ;
}


//
// Perform the test itself. Since all of the test-related
// activity starts in this function, it should be easier to
// track samples back to the test itself.
//
void run_no_events(int iteration_count)
{
  int i ;
  for (i = iteration_count ; i > 0 ; i--) {
    test_loop_prototype(array) ;
  }
}

void run_external(int iteration_count)
{
  int i ;
  start_counting(ARMV6_EVENT_EXPL_D_ACCESS, ARMV6_EVENT_DDEP_STALL) ;
  for (i = iteration_count ; i > 0 ; i--) {
    test_loop_prototype(array) ;
  }
  stop_counting() ;
}

void run_ipc(int iteration_count)
{
  int i ;
  start_counting(ARMV6_EVENT_INSTR_EXEC, ARMV6_EVENT_CPU_CYCLES) ;
  for (i = iteration_count ; i > 0 ; i--) {
    test_loop_prototype(array) ;
  }
  stop_counting() ;
}

void run_cache(int iteration_count)
{
  int i ;
  start_counting(ARMV6_EVENT_DCACHE_CACCESS, ARMV6_EVENT_DCACHE_MISS) ;
  for (i = iteration_count ; i > 0 ; i--) {
    test_loop_prototype(array) ;
  }
  stop_counting() ;
}

void run_tlb(int iteration_count)
{
  int i ;
  start_counting(ARMV6_EVENT_DTLB_MISS, ARMV6_EVENT_MAIN_TLB_MISS) ;
  for (i = iteration_count ; i > 0 ; i--) {
    test_loop_prototype(array) ;
  }
  stop_counting() ;
}


//
// Handle command line arguments, bind the process to a
// particular CPU, run the test and write the results to
// the results file.
//
int main(int argc, char *argv[])
{
  int arg_count ;
  pid_t my_process_id ;
  float cpu_clock ;

  //
  // Variables to hold the run parameters. These could be
  // modified by command line arguments (if command line
  // parsing is ever implemented. :-)
  //
  int iteration_count = TEST_ITERATIONS ;

  //
  // Variables for measuring expired CPU cycles
  // 
  unsigned long long int before ;              // Perf counter at start of test
  unsigned long long int after ;               // Oerf counter at end of test
  unsigned long long int total_cycles ;        // Cycles consumed by test

  //
  // Variables for computing expected results
  //
  unsigned long long int total_retires ;              // Total retire events
  unsigned long long int total_dtlb_misses ;          // Total DTLB miss events
  unsigned long long int total_dc_accesses ;          // Total DC access events
  unsigned long long int total_dc_misses ;            // Total DC miss events
  unsigned long long int expected_cycle_samples ;     // Expected cycle samples
  unsigned long long int expected_retire_samples ;    // Expected ret samples
  unsigned long long int expected_dc_access_samples ; // Exp DC access samples
  unsigned long long int expected_dc_miss_samples ;   // Exp DC miss samples
  unsigned long long int expected_dtlb_miss_samples ; // Exp DTLB miss samples

#ifdef _DEBUG
  d_flag = 1 ;
#endif

  // Parse any command line arguments. Use simple one character
  // option names preceded by a '-' character.
  for (arg_count = 1 ; arg_count < argc ; arg_count++)
  {
    if( argv[arg_count][0] == '-' )
    {
      switch( argv[arg_count][1] )
      {
      case 'a' :
        {
          // Set processor core for array allocation and initialization 
	  fprintf(stderr, "*warning* -a is UNIMPLEMENTED\n") ;
          a_flag = 1 ;
          arg_count++ ;
          if (arg_count < argc)
          {
            initialization_core = atoi(argv[arg_count]) ;
          }
          break ;
        }
      case 'c' :
        {
          // Measure L1 data cache (DC) events
          c_flag = 1 ;
          break ;
        }
      case 'd' :
        {
          // Enable debug output
          d_flag = 1 ;
          break ;
        }
      case 'e' :
        {
          // Measure explicit external data access events
          e_flag = 1 ;
          break ;
        }
      case 'h' :
        {
          // Display usage information
          display_usage_info(argv[0], usage_strings) ;
          h_flag = 1 ;
          break ;
        }
      case 'i' :
        {
          // Measure instructions per cycle (IPC)
          i_flag = 1 ;
          break ;
        }
      case 'l' :
        {
          // Set cache line size to N bytes
          l_flag = 1 ;
          arg_count++ ;
          if (arg_count < argc)
          {
            line_size = LINE_SIZE ;
            // line_size = atoi(argv[arg_count]) ;
            // pointers_per_line = (line_size / pointer_size) ;
          }
          break ;
        }
      case 'n' :
        {
          // Set iteration count (number of tests)
          n_flag = 1 ;
          arg_count++ ;
          if (arg_count < argc)
          {
            iteration_count = atoi(argv[arg_count]) ;
          }
          break ;
        }
      case 'p' :
        {
          // Set processor core for pointer chasing 
	  fprintf(stderr, "*warning* -p is UNIMPLEMENTED\n") ;
          p_flag = 1 ;
          arg_count++ ;
          if (arg_count < argc)
          {
            chase_core = atoi(argv[arg_count]) ;
          }
          break ;
        }
      case 's' :
        {
          // Set array size to N elements
          s_flag = 1 ;
          arg_count++ ;
          if (arg_count < argc)
          {
            long int elements = ARRAY_ELEMENTS ;
            elements = atoi(argv[arg_count]) ;
            array_elements = elements ;
          }
          break ;
        }
      case 't' :
        {
          // Measure translation lookaside buffer (TLB) events
          t_flag = 1 ;
          break ;
        }
      case 'v' :
        {
          v_flag = 1 ;
          break ;
        }
      default:
        {
          printf("Illegal option: %c\n", argv[arg_count][1]) ;
          break ;
        }
      }
    }
  }

  // Get ID of current process and CPU clock speed
  my_process_id = getpid() ;
  cpu_clock = 1000000.0f * (float)get_cpu_speed() ;

  if (create_result_file(RESULT_FILE_NAME) == 0) {
    exit( EXIT_FAILURE ) ;
  }

  // Allocate and initialize the pointer chasing array
  initialize_array() ;

  // Snap performance counter and TOD at start of test run
  before = getTimeStamp() ;

  // Perform pointer chasing
  if (c_flag) {
    run_cache(iteration_count) ;
  } else if (e_flag) {
    run_external(iteration_count) ;
  } else if (i_flag) {
    run_ipc(iteration_count) ;
  } else if (t_flag) {
    run_tlb(iteration_count) ;
  } else {
    run_no_events(iteration_count) ;
  }

  // Snap performance counter after the test loop
  after = getTimeStamp() ;
  total_cycles = after - before ;

  if (d_flag) {
    printf("*debug* Before: %16" FMT64 "d\n", before) ;
    printf("*debug* After:  %16" FMT64 "d\n", after) ;
  }

  // Display estimated results
  print_heading("Pointer chasing test") ;

  print_system_info() ;
  fprintf(result_file,"Run information\n") ;
  fprintf(result_file,"  PID:                        %16d\n", my_process_id) ;
  fprintf(result_file,"  Allocation core:            %16d\n", initialization_core) ;
  fprintf(result_file,"  Chase core:                 %16d\n", chase_core) ;
  fprintf(result_file,"  CPU clock speed (GHz):      %16.4f\n", (float)(cpu_clock / 1000000000.0)) ;
  fprintf(result_file,"  Number of test iterations:  %16d\n", iteration_count) ;
  fprintf(result_file,"  Array elements/cache lines: %16d\n", array_elements) ;
  fprintf(result_file,"  Array size/bytes:           %16d\n", array_elements * line_size) ;
  fprintf(result_file,"  Cache line size:            %16d\n", line_size) ;
  fprintf(result_file,"  Pointer size:               %16d\n", pointer_size) ;
  fprintf(result_file,"  Pointers per line:          %16d\n", pointers_per_line) ;

  if (c_flag || e_flag || i_flag || t_flag) {
    fprintf(result_file,"Performance Monitor events\n") ;
    print_counts(result_file) ;
  }

  fprintf(result_file, "Run execution summary\n") ;
  print_process_times() ;
  print_elapsed_time() ;

  close_result_file() ;
  return( EXIT_SUCCESS ) ;
}