// latency.c

//
// Memory access latency
//

//
// Author: P.J. Drongowski
// 8 June 2013
//
// Copyright (c) 2013 Paul. J. Drongowski
//

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "../test_common/test_common.h"
#include "../test_common/rpi_pmu.h"

// Name of the result file handed to create_result_file() in main().
#define RESULT_FILE_NAME  "latency.txt"
// Default number of sampling iterations.
#define TEST_ITERATIONS     1000
// Upper limit on the iteration count; also the capacity of samples[].
#define MAX_TEST_ITERATIONS 2000
// Number of sampling iterations to run. main() replaces the default
// when -n is given and caps the value at MAX_TEST_ITERATIONS.
int iteration_count = TEST_ITERATIONS ;

//
// Latency samples are written to this file. One sample is
// taken during each iteration of the test loop, so samples[]
// is sized for the largest permitted iteration count.
//
#define SAMPLE_FILE_NAME  "samples.dat"
uint32_t samples[MAX_TEST_ITERATIONS] ;


//
// Default values for the pointer array and the cache
// line size. See chase.c for descriptions of these
// data structures.
//
// Default number of array elements (one CacheLine per element)
#define ARRAY_ELEMENTS     (16 * 1024)
// Size of one pointer in bytes for the build target
#define POINTER_SIZE       (sizeof(void*))
// Cache line size in bytes assumed by this test
#define LINE_SIZE          32
// Number of pointers that fit in one cache line
#define POINTERS_PER_LINE  (LINE_SIZE / POINTER_SIZE)

// One list node per array element. A node is an array of
// POINTERS_PER_LINE pointers, so it occupies LINE_SIZE bytes when
// LINE_SIZE is a multiple of the pointer size. Only ptrCacheLine[0]
// is used as the "next" link by the code in this file; the other
// slots are cleared to NULL by initialize_array().
typedef struct _CacheLine {
  struct _CacheLine* ptrCacheLine[POINTERS_PER_LINE] ;
} CacheLine ;

// Run-time copies of the defaults above. They are printed in the
// "Run information" section of the result file.
// array_elements can be changed with the -s option.
long int array_elements = ARRAY_ELEMENTS ;
// line_size is only ever assigned LINE_SIZE in this file.
long int line_size = LINE_SIZE ;
long int pointer_size = POINTER_SIZE ;
long int pointers_per_line = POINTERS_PER_LINE ;

// The test array; allocated and linked by initialize_array().
CacheLine* array = NULL ;

//
// Command line options
//
// Each flag is set to 1 by main() when the corresponding option
// appears on the command line. Some options are not fully
// implemented.
//
int l_flag = 0 ;    // -l seen (cache line size; the value is currently ignored)
int n_flag = 0 ;    // -n seen (number of tests / iteration count)
int s_flag = 0 ;    // -s seen (array size in elements, minimum 256)

// getopt-style option string. It is not referenced in this file;
// main() parses argv by hand.
#define OPTIONS "dhl:n:s:v"  // Currently unimplemented (getopt)

//
// Usage text, one line per option, terminated by NULL.
// main() passes this table to display_usage_info() for -h.
//
// The -l and -s lines describe what main() actually does: the -l
// value is consumed but ignored, and -s takes a plain element
// count (values below 256 are raised to 256).
//
char *usage_strings[] = {
  "  -d    Display debugging information",
  "  -h    Display usage information",
  "  -l N  Set the cache line size to N bytes (not implemented; N is ignored)",
  "  -n N  Set the number of tests (iteration count)",
  "  -s N  Allocate an array with N elements (minimum 256)",
  "  -v    Display more information (verbose output)",
  NULL
} ;

//
// Allocate and initialize the test array
//
void initialize_array()
{
  long int steps ;   // Number of initialization setps
  long int top ;     // Index of top cache line
  long int bottom ;  // Index of bottom cache line
  long int i ;       // Array index counter
  int j ;            // Pointer index counter

  if ((array = (CacheLine*) malloc(array_elements * line_size)) == NULL) {
    fprintf(stderr, "Couldn't allocate test array\n") ;
    exit( EXIT_FAILURE ) ;
  }

  //
  // Set the entire array to zero.
  //
  for (i = 0 ; i < array_elements ; i++) {
    for (j = 0 ; j < pointers_per_line ; j++) {
      array[i].ptrCacheLine[j] = NULL ;
    }
  }

  //
  // The test array is divided into two halves: top and
  // bottom. Set the pointers
  // so that memory accesses ping-pong between the two
  // halves of the array. Ping-pong between cache lines.
  //
  steps = array_elements / 2 ;
  bottom = 0 ;
  top = steps ;
  for (i = 0 ; i < steps ; i++) {
    array[bottom].ptrCacheLine[0] = &array[top] ;
    array[top].ptrCacheLine[0] = &array[bottom+1] ;
    top++ ;
    bottom++ ;
  }

  //
  // Terminate the linked list
  //
  array[(array_elements-1)].ptrCacheLine[0] = NULL ;

  if (d_flag) {
    for (i = 0 ; i < array_elements ; i++) {
      // fprintf(result_file, "%16" FMT64 "x\n", array[i].ptrCacheLine[0]) ;
      fprintf(result_file, "%d  %8lx  %8lx\n",
	      i, &array[i].ptrCacheLine[0], array[i].ptrCacheLine[0]) ;
    }
  }
}

//
// Read a 32-bit value from coprocessor 15 (c15, c12, opcode 1)
// with a single MRC instruction and return it. The callers in
// this file treat the value as a running cycle count.
//
static inline uint32_t armv6_read_ccr()
{
  uint32_t ccr ;

  asm volatile("mrc   p15, 0, %0, c15, c12, 1" : "=r"(ccr)) ;

  return ccr ;
}

//
// Walk the whole list and time every link that is followed.
//
// Each pointer dereference is bracketed by two counter reads;
// the differences are accumulated and the sum is returned.
// Returns 0 for an empty (NULL) list.
//
uint32_t latency_cycles(CacheLine* linked_list)
{
  register CacheLine* node = linked_list ;
  register uint32_t start, stop ;
  register uint32_t total = 0 ;

  while (node != NULL) {
    // Only the load of the next pointer sits between the reads
    start = armv6_read_ccr() ;
    node = node->ptrCacheLine[0] ;
    stop = armv6_read_ccr() ;

    total += stop - start ;
  }

  return total ;
}

//
// Time one complete traversal of the global test array.
//
// The performance monitor control register is written three
// times: to clear the sticky overflow bits, to reset and enable
// the counters, and finally with zero to stop them. Returns the
// cycle total reported by latency_cycles().
//
uint32_t measure_latency_cycles()
{
  uint32_t total ;

  // Clear the sticky overflow bits
  armv6_pmcr_write(ARMV6_PMCR_CCOUNT_OVERFLOW |
                   ARMV6_PMCR_COUNT0_OVERFLOW |
                   ARMV6_PMCR_COUNT1_OVERFLOW) ;

  // Clear and start the performance counters. Event number 0xFF
  // is selected for both event counters.
  // NOTE(review): the meaning of event 0xFF is not visible here -
  // confirm against the PMU documentation.
  armv6_pmcr_write(ARMV6_PMCR_ENABLE |
                   ARMV6_PMCR_CCOUNT_RESET |
                   ARMV6_PMCR_CTR01_RESET |
                   (0xFF << ARMV6_PMCR_EVT_COUNT0_SHIFT) |
                   (0xFF << ARMV6_PMCR_EVT_COUNT1_SHIFT)) ;

  total = latency_cycles(array) ;

  // Stop the performance counters
  armv6_pmcr_write(0) ;

  return total ;
}

//
// Walk the whole list but time only one link.
//
// The first n items are followed without timing; the dereference
// of item number n (counting from zero) is bracketed by two
// counter reads. Returns that single cycle difference, or 0 when
// the list ends before item n is reached or n is negative.
//
uint32_t sample_cycles(CacheLine* linked_list, int n)
{
  register CacheLine* item ;
  register uint32_t before, after, cycles ;
  // Explicit type: "register count" relied on implicit int,
  // which is not valid C since C99.
  register int count = n ;

  cycles = 0 ;
  for(item = linked_list ; item != NULL ; ) {
    if (count-- == 0) {
      before = armv6_read_ccr() ;
      item = item->ptrCacheLine[0] ;
      after = armv6_read_ccr() ;
      cycles = after - before ;
    } else {
      item = item->ptrCacheLine[0] ;
    }
  }

  return( cycles ) ;
}

//
// Traverse the list iteration_count times, taking a single
// latency sample on each pass, and store the results in
// samples[]. The slots are filled from the highest index down.
//
// Compile with -O2 to inline the MCR/MRC instructions!
//
void sample_latency_cycles()
{
  register int slot ;

  // Clear the sticky overflow bits
  armv6_pmcr_write(ARMV6_PMCR_CCOUNT_OVERFLOW |
                   ARMV6_PMCR_COUNT0_OVERFLOW |
                   ARMV6_PMCR_COUNT1_OVERFLOW) ;

  slot = iteration_count - 1 ;
  while (slot >= 0) {
    // Clear and start the performance counters
    armv6_pmcr_write(ARMV6_PMCR_ENABLE |
                     ARMV6_PMCR_CCOUNT_RESET |
                     ARMV6_PMCR_CTR01_RESET |
                     (0xFF << ARMV6_PMCR_EVT_COUNT0_SHIFT) |
                     (0xFF << ARMV6_PMCR_EVT_COUNT1_SHIFT)) ;

    // Pseudo-"random" sampling: the timed list item moves within
    // a 64 item window (187 .. 250) according to the low six
    // bits of the slot number.
    samples[slot] = sample_cycles(array, 187 + (slot & 0x3F)) ;

    // Stop the performance counters
    armv6_pmcr_write(0) ;

    slot-- ;
  }
}

//
// Write the number of samples and the latency samples to a
// text file, one decimal value per line.
//
// filename: path of the file to create (overwritten if present).
//
// The first line holds iteration_count; the following lines hold
// samples[0] .. samples[iteration_count-1]. Failures to create or
// write the file are reported on stderr; the function returns
// normally in either case.
//
void write_samples(char* filename)
{
  int i ;
  int write_failed ;
  FILE* sample_file = fopen(filename, "w") ;

  if (sample_file == NULL) {
    fprintf(stderr, "Couldn't create sample file\n") ;
    return ;
  }

  // The first line is the number of samples (an int, hence %d)
  fprintf(sample_file, "%d\n", iteration_count) ;

  // Samples are uint32_t; widen explicitly to match %lu
  for (i = 0 ; i < iteration_count ; i++) {
    fprintf(sample_file, "%lu\n", (unsigned long) samples[i]) ;
  }

  // fclose() flushes buffered output, so check both the stream
  // error flag and the close result.
  write_failed = ferror(sample_file) ;
  if (fclose(sample_file) != 0) {
    write_failed = 1 ;
  }
  if (write_failed) {
    fprintf(stderr, "Couldn't write sample file\n") ;
  }
}


//
// Parse a decimal option value.
//
// Returns 1 and stores the number in *value when text consists
// entirely of a decimal integer; otherwise returns 0 and leaves
// *value unchanged. Out-of-range input saturates at LONG_MIN or
// LONG_MAX (strtol behavior); callers range-check the result.
//
static int parse_option_value(const char* text, long int* value)
{
  char* end = NULL ;
  long int parsed = strtol(text, &end, 10) ;

  if ((end == text) || (*end != '\0')) {
    return( 0 ) ;
  }

  *value = parsed ;
  return( 1 ) ;
}

//
// Program entry point.
//
// Parses the command line, builds the test array, measures the
// total traversal latency (after one warm-up traversal), collects
// per-iteration latency samples into the sample file, and writes
// a report to the result file.
//
// Returns EXIT_SUCCESS; exits with EXIT_FAILURE if the result
// file can't be created.
//
int main(int argc, char* argv[])
{
  int arg_count ;
  uint32_t cycles = 0 ;
  double latency = 0.0 ;
#ifdef _DEBUG
  d_flag = 1 ;
#endif

  // Parse any command line arguments. Use simple one character
  // option names preceded by a '-' character.
  for (arg_count = 1 ; arg_count < argc ; arg_count++)
  {
    const char* option = argv[arg_count] ;
    long int value = 0 ;

    // Arguments that don't start with '-' are ignored
    if (option[0] != '-') {
      continue ;
    }

    switch( option[1] )
    {
    case 'd' :
      // Enable debug output
      d_flag = 1 ;
      break ;

    case 'h' :
      // Display usage information
      display_usage_info(argv[0], usage_strings) ;
      h_flag = 1 ;
      break ;

    case 'l' :
      // Cache line size. The value is consumed but not used:
      // CacheLine has a compile-time size, so the line size
      // stays at LINE_SIZE.
      l_flag = 1 ;
      arg_count++ ;
      if (arg_count < argc) {
        line_size = LINE_SIZE ;
      } else {
        fprintf(stderr, "Option -l requires a value\n") ;
      }
      break ;

    case 'n' :
      // Set iteration count (number of tests). Invalid or
      // negative values leave the current count unchanged;
      // large values are capped at the capacity of samples[].
      n_flag = 1 ;
      arg_count++ ;
      if (arg_count >= argc) {
        fprintf(stderr, "Option -n requires a value\n") ;
      } else if (!parse_option_value(argv[arg_count], &value) ||
                 (value < 0)) {
        fprintf(stderr, "Invalid iteration count: %s\n",
                argv[arg_count]) ;
      } else if (value > MAX_TEST_ITERATIONS) {
        iteration_count = MAX_TEST_ITERATIONS ;
      } else {
        iteration_count = (int) value ;
      }
      break ;

    case 's' :
      // Set array size to N elements (at least 256). Sizes
      // whose byte count would overflow a long are rejected.
      s_flag = 1 ;
      arg_count++ ;
      if (arg_count >= argc) {
        fprintf(stderr, "Option -s requires a value\n") ;
      } else if (!parse_option_value(argv[arg_count], &value)) {
        fprintf(stderr, "Invalid array size: %s\n",
                argv[arg_count]) ;
      } else if (value > (LONG_MAX / line_size)) {
        fprintf(stderr, "Array size too large: %s\n",
                argv[arg_count]) ;
      } else {
        array_elements = (value < 256) ? 256 : value ;
      }
      break ;

    case 'v' :
      v_flag = 1 ;
      break ;

    default:
      printf("Illegal option: %c\n", option[1]) ;
      break ;
    }
  }

  // Create the result file
  if (create_result_file(RESULT_FILE_NAME) == 0) {
    exit( EXIT_FAILURE ) ;
  }

  // Allocate and initialize the linked list
  initialize_array() ;

  // Measure latency cycles to compute an average.
  // The first call is needed to warm up the cache;
  // its result is discarded.
  (void) measure_latency_cycles() ;
  cycles = measure_latency_cycles() ;

  // Collect and write latency cycles to the
  // samples file
  sample_latency_cycles() ;
  write_samples(SAMPLE_FILE_NAME) ;

  // Display estimated results
  print_heading("Memory access latency test") ;

  print_system_info() ;

  // All values except the iteration count are long int (%ld)
  fprintf(result_file,"Run information\n") ;
  fprintf(result_file,"  Number of test iterations:  %16d\n", iteration_count) ;
  fprintf(result_file,"  List items/cache lines:     %16ld\n", array_elements) ;
  fprintf(result_file,"  Array size/bytes:           %16ld\n", array_elements * line_size) ;
  fprintf(result_file,"  Cache line size:            %16ld\n", line_size) ;
  fprintf(result_file,"  Pointer size:               %16ld\n", pointer_size) ;
  fprintf(result_file,"  Pointers per line:          %16ld\n", pointers_per_line) ;

  // Compute and print the average latency using the
  // total number of latency cycles. This is NOT the
  // most accurate method due to statistical outliers.
  // Subtract off a six cycle measurement bias.
  latency = ((double)cycles / (double)array_elements) - 6.0 ;
  fprintf(result_file, "Statistics\n") ;
  fprintf(result_file, "  Cycles:  %lu\n", (unsigned long) cycles) ;
  fprintf(result_file, "  Latency: %6.3f\n", latency) ;

  fprintf(result_file, "Run execution summary\n") ;
  print_process_times() ;
  print_elapsed_time() ;

  close_result_file() ;
  return( EXIT_SUCCESS ) ;
}