//
// Memory access latency
//
//
// Author: P.J. Drongowski
// 8 June 2013
//
// Copyright (c) 2013 Paul. J. Drongowski
//
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include "../test_common/test_common.h"
#include "../test_common/rpi_pmu.h"
// Name of the result file handed to create_result_file() in main().
#define RESULT_FILE_NAME "latency.txt"
// Default number of sampling iterations (initial value of iteration_count).
#define TEST_ITERATIONS 1000
// Upper bound applied to iteration_count in main(); also the capacity
// of the samples[] array.
#define MAX_TEST_ITERATIONS 2000
// Number of sampling iterations to run. May be overridden with the
// -n option, in which case main() clamps it to MAX_TEST_ITERATIONS.
int iteration_count = TEST_ITERATIONS ;
//
// Latency samples. sample_latency_cycles() stores one sample per
// iteration of its loop in samples[]; write_samples() later writes
// them to the file named by SAMPLE_FILE_NAME.
//
#define SAMPLE_FILE_NAME "samples.dat"
uint32_t samples[MAX_TEST_ITERATIONS] ;
//
// Default values for the pointer array and the cache
// line size. See chase.c for descriptions of these
// data structures.
//
//   ARRAY_ELEMENTS     default number of CacheLine elements in the array
//   POINTER_SIZE       size in bytes of a pointer on the build target
//   LINE_SIZE          cache line size in bytes assumed by this test
//   POINTERS_PER_LINE  number of pointers that fit in LINE_SIZE bytes
//
#define ARRAY_ELEMENTS (16 * 1024)
#define POINTER_SIZE (sizeof(void*))
#define LINE_SIZE 32
#define POINTERS_PER_LINE (LINE_SIZE / POINTER_SIZE)
// One array element: POINTERS_PER_LINE pointers. The code in this
// file uses only ptrCacheLine[0] as the "next" link of the list; the
// other slots are set to NULL by initialize_array() and never read.
typedef struct _CacheLine {
struct _CacheLine* ptrCacheLine[POINTERS_PER_LINE] ;
} CacheLine ;
// Run-time copies of the compile-time defaults.
// array_elements can be changed with the -s option. line_size is only
// ever assigned LINE_SIZE in this file (the -l value is ignored by
// main()). pointer_size and pointers_per_line are never reassigned here.
long int array_elements = ARRAY_ELEMENTS ;
long int line_size = LINE_SIZE ;
long int pointer_size = POINTER_SIZE ;
long int pointers_per_line = POINTERS_PER_LINE ;
// The test array; allocated by initialize_array(), NULL until then.
CacheLine* array = NULL ;
//
// Command line options
//
// Each flag below is set to 1 by main() when the matching option is
// seen on the command line. The flags for -d, -h and -v are not
// defined in this file (presumably they come from test_common.h).
//
// Run options (some not implemented)
//
int l_flag = 0 ; // -l seen (main() ignores the requested line size)
int n_flag = 0 ; // -n seen: number of tests (iteration count) was given
int s_flag = 0 ; // -s seen: array size was given
// getopt-style option string. main() parses argv by hand and does not
// use this macro.
#define OPTIONS "dhl:n:s:v" // Currently unimplemented (getopt)
// Usage text passed to display_usage_info() for -h. NULL-terminated.
// NOTE(review): the -s line says "N * 1024 elements", but main() uses
// N directly as the element count -- confirm which one is intended.
char *usage_strings[] = {
" -d Display debugging information",
" -h Display usage information",
" -l N Set the cache line size to N bytes",
" -n N Set the number of tests (iteration count)",
" -s N Allocate an array with N * 1024 elements",
" -v Display more information (verbose output)",
NULL
} ;
//
// Allocate and initialize the test array
//
void initialize_array()
{
long int steps ; // Number of initialization setps
long int top ; // Index of top cache line
long int bottom ; // Index of bottom cache line
long int i ; // Array index counter
int j ; // Pointer index counter
if ((array = (CacheLine*) malloc(array_elements * line_size)) == NULL) {
fprintf(stderr, "Couldn't allocate test array\n") ;
exit( EXIT_FAILURE ) ;
}
//
// Set the entire array to zero.
//
for (i = 0 ; i < array_elements ; i++) {
for (j = 0 ; j < pointers_per_line ; j++) {
array[i].ptrCacheLine[j] = NULL ;
}
}
//
// The test array is divided into two halves: top and
// bottom. Set the pointers
// so that memory accesses ping-pong between the two
// halves of the array. Ping-pong between cache lines.
//
steps = array_elements / 2 ;
bottom = 0 ;
top = steps ;
for (i = 0 ; i < steps ; i++) {
array[bottom].ptrCacheLine[0] = &array[top] ;
array[top].ptrCacheLine[0] = &array[bottom+1] ;
top++ ;
bottom++ ;
}
//
// Terminate the linked list
//
array[(array_elements-1)].ptrCacheLine[0] = NULL ;
if (d_flag) {
for (i = 0 ; i < array_elements ; i++) {
// fprintf(result_file, "%16" FMT64 "x\n", array[i].ptrCacheLine[0]) ;
fprintf(result_file, "%d %8lx %8lx\n",
i, &array[i].ptrCacheLine[0], array[i].ptrCacheLine[0]) ;
}
}
}
// Read the CP15 register addressed by c15, c12, opcode2 = 1 with an
// MRC instruction and return its 32-bit contents. The callers in this
// file treat the result as a running cycle count.
// NOTE(review): this encoding is listed as the Cycle Counter Register
// in the ARM1176 TRM -- confirm for the target core.
static inline uint32_t armv6_read_ccr()
{
    uint32_t ccr ;

    asm volatile("mrc p15, 0, %0, c15, c12, 1" : "=r"(ccr)) ;
    return ccr ;
}
//
// Traverse the linked list that starts at linked_list, following
// ptrCacheLine[0] until a NULL link is reached, and time every
// pointer load individually.
//
// Returns the sum over all loads of (counter after - counter before),
// where the counter is read with armv6_read_ccr(). The differences and
// the running sum are uint32_t, so they wrap modulo 2^32. The total
// includes the cost of the two counter reads around each load; main()
// subtracts a fixed per-load bias when it computes the average.
//
// The list must be NULL-terminated, otherwise the loop never ends.
//
uint32_t latency_cycles(CacheLine* linked_list)
{
register CacheLine* item ;
register uint32_t before, after, cycles ;
cycles = 0 ;
for(item = linked_list ; item != NULL ; ) {
before = armv6_read_ccr() ;
// The timed operation: one dependent load of the next link
item = item->ptrCacheLine[0] ;
after = armv6_read_ccr() ;
cycles += after - before ;
}
return( cycles ) ;
}
//
// Run one fully timed traversal of the global test array.
//
// Programs the performance monitor control register around the
// traversal (clear overflow flags, reset and enable the counters with
// event number 0xFF in both event fields, then write 0 to stop) and
// returns the total reported by latency_cycles().
//
uint32_t measure_latency_cycles()
{
    uint32_t total ;

    // Clear the sticky overflow bits
    armv6_pmcr_write(ARMV6_PMCR_CCOUNT_OVERFLOW
                     | ARMV6_PMCR_COUNT0_OVERFLOW
                     | ARMV6_PMCR_COUNT1_OVERFLOW) ;

    // Clear and start the performance counters
    armv6_pmcr_write(ARMV6_PMCR_ENABLE
                     | ARMV6_PMCR_CCOUNT_RESET
                     | ARMV6_PMCR_CTR01_RESET
                     | (0xFF << ARMV6_PMCR_EVT_COUNT0_SHIFT)
                     | (0xFF << ARMV6_PMCR_EVT_COUNT1_SHIFT)) ;

    total = latency_cycles(array) ;

    // Stop the performance counters
    armv6_pmcr_write(0) ;

    return total ;
}
//
// Traverse the whole linked list but time only one pointer load: the
// load at zero-based position n along the list.
//
//   linked_list  head of a NULL-terminated list linked through
//                ptrCacheLine[0]
//   n            zero-based position of the load to time; expected to
//                be non-negative
//
// Returns the counter difference (after - before, modulo 2^32) read
// with armv6_read_ccr() around that one load, or 0 if the list has n
// or fewer nodes. The value includes the cost of the two counter reads.
//
uint32_t sample_cycles(CacheLine* linked_list, int n)
{
    register CacheLine* item ;
    register uint32_t before, after, cycles ;
    // Spell out the type: the original "register count = n" relied on
    // implicit int, which is no longer valid since C99.
    register int count = n ;

    cycles = 0 ;
    for (item = linked_list ; item != NULL ; ) {
        if (count-- == 0) {
            // The single timed load
            before = armv6_read_ccr() ;
            item = item->ptrCacheLine[0] ;
            after = armv6_read_ccr() ;
            cycles = after - before ;
        } else {
            // Untimed load: just advance along the list
            item = item->ptrCacheLine[0] ;
        }
    }
    return( cycles ) ;
}
//
// Run the pointer chasing loop iteration_count times and take one
// latency sample per run, storing it in samples[]. The slots are
// filled from the highest index down to 0.
//
// Compile with -O2 to inline the MCR/MRC instructions!
//
void sample_latency_cycles()
{
    register int slot ;

    // Clear the sticky overflow bits (once, before the sampling loop)
    armv6_pmcr_write(ARMV6_PMCR_CCOUNT_OVERFLOW
                     | ARMV6_PMCR_COUNT0_OVERFLOW
                     | ARMV6_PMCR_COUNT1_OVERFLOW) ;

    slot = iteration_count ;
    while (--slot >= 0) {
        // Clear and start the performance counters
        armv6_pmcr_write(ARMV6_PMCR_ENABLE
                         | ARMV6_PMCR_CCOUNT_RESET
                         | ARMV6_PMCR_CTR01_RESET
                         | (0xFF << ARMV6_PMCR_EVT_COUNT0_SHIFT)
                         | (0xFF << ARMV6_PMCR_EVT_COUNT1_SHIFT)) ;

        // Kind of a bogus "random" sampling: the timed load sits at
        // list position 187 plus the low six bits of the slot index,
        // i.e. within a 64 element window of the traversal.
        samples[slot] = sample_cycles(array, 187 + (slot & 0x3F)) ;

        // Stop the performance counters
        armv6_pmcr_write(0) ;
    }
}
//
// Write the number of samples and the latency samples to a text file.
//
//   filename  path of the file to create (truncated if it exists)
//
// File layout: the first line holds iteration_count, followed by one
// sample per line in index order, samples[0] first.
//
// If the file cannot be created, or closing it reports an error, a
// message is printed on stderr and the function returns normally.
//
void write_samples(char* filename)
{
    int i ;
    FILE* sample_file = fopen(filename, "w") ;

    if (sample_file == NULL) {
        fprintf(stderr, "Couldn't create sample file\n") ;
        return ;
    }

    // The first line is the number of samples. iteration_count is an
    // int, so it is printed with %d (the original used %ld).
    fprintf(sample_file, "%d\n", iteration_count) ;

    // Samples are unsigned 32-bit counts: convert explicitly and print
    // them unsigned so large values do not come out negative.
    for (i = 0 ; i < iteration_count ; i++) {
        fprintf(sample_file, "%lu\n", (unsigned long) samples[i]) ;
    }

    // fclose() flushes buffered data, so a failure here means that
    // the file may be incomplete.
    if (fclose(sample_file) != 0) {
        fprintf(stderr, "Couldn't write sample file\n") ;
    }
}
//
// Program entry point.
//
// Parses the command line, builds the pointer-chasing list, measures
// the total traversal latency (after one warm-up pass), collects the
// per-iteration samples and writes them to SAMPLE_FILE_NAME, then
// writes the run report to the result file.
//
// Options have the form "-x" or "-x N", with the value in the next
// argument. Arguments that do not start with '-' are ignored. An
// option that needs a value but is the last argument leaves the
// default in place.
//
// Returns EXIT_SUCCESS. Exits with EXIT_FAILURE if
// create_result_file() returns 0.
//
int main(int argc, char* argv[])
{
    int arg_count ;
    uint32_t cycles = 0 ;
    double latency = 0.0 ;

#ifdef _DEBUG
    d_flag = 1 ;
#endif

    // Parse any command line arguments. Use simple one character
    // option names preceded by a '-' character.
    for (arg_count = 1 ; arg_count < argc ; arg_count++) {
        if (argv[arg_count][0] != '-') {
            continue ;
        }

        switch (argv[arg_count][1]) {
        case 'd' :
            // Enable debug output
            d_flag = 1 ;
            break ;

        case 'h' :
            // Display usage information; the run continues afterwards
            display_usage_info(argv[0], usage_strings) ;
            h_flag = 1 ;
            break ;

        case 'l' :
            // Cache line size option. The value is consumed but not
            // applied: the CacheLine type is laid out at compile time
            // from LINE_SIZE, so line_size stays at LINE_SIZE.
            l_flag = 1 ;
            arg_count++ ;
            if (arg_count < argc) {
                line_size = LINE_SIZE ;
            }
            break ;

        case 'n' :
            // Set iteration count (number of tests)
            n_flag = 1 ;
            arg_count++ ;
            if (arg_count < argc) {
                iteration_count = atoi(argv[arg_count]) ;
                // samples[] holds MAX_TEST_ITERATIONS entries
                if (iteration_count > MAX_TEST_ITERATIONS) {
                    iteration_count = MAX_TEST_ITERATIONS ;
                }
                // A negative count would be written as-is to the
                // header line of the sample file
                if (iteration_count < 0) {
                    iteration_count = 0 ;
                }
            }
            break ;

        case 's' :
            // Set array size to N elements
            // NOTE(review): the usage text says "N * 1024 elements";
            // the value is used unscaled here -- confirm the intent.
            s_flag = 1 ;
            arg_count++ ;
            if (arg_count < argc) {
                long int elements = atoi(argv[arg_count]) ;

                if (elements < 256) {
                    elements = 256 ;
                }
                // The list links elements in bottom/top pairs, so an
                // odd trailing element would never be visited while
                // the average below still divides by array_elements.
                if ((elements % 2) != 0) {
                    elements-- ;
                }
                array_elements = elements ;
            }
            break ;

        case 'v' :
            v_flag = 1 ;
            break ;

        default :
            printf("Illegal option: %c\n", argv[arg_count][1]) ;
            break ;
        }
    }

    // Create the result file
    if (create_result_file(RESULT_FILE_NAME) == 0) {
        exit( EXIT_FAILURE ) ;
    }

    // Allocate and initialize the linked list
    initialize_array() ;

    // Measure latency cycles to compute an average.
    // The first call is needed to warm up the cache; only the result
    // of the second call is reported.
    cycles = measure_latency_cycles() ;
    cycles = measure_latency_cycles() ;

    // Collect and write latency cycles to the samples file
    sample_latency_cycles() ;
    write_samples(SAMPLE_FILE_NAME) ;

    // Display estimated results
    print_heading("Memory access latency test") ;
    print_system_info() ;

    // The size-related globals are long int, so they need %ld.
    fprintf(result_file,"Run information\n") ;
    fprintf(result_file," Number of test iterations: %16d\n", iteration_count) ;
    fprintf(result_file," List items/cache lines: %16ld\n", array_elements) ;
    fprintf(result_file," Array size/bytes: %16ld\n", array_elements * line_size) ;
    fprintf(result_file," Cache line size: %16ld\n", line_size) ;
    fprintf(result_file," Pointer size: %16ld\n", pointer_size) ;
    fprintf(result_file," Pointers per line: %16ld\n", pointers_per_line) ;

    // Compute and print the average latency using the
    // total number of latency cycles. This is NOT the
    // most accurate method due to statistical outliers.
    // Subtract off a six cycle measurement bias.
    latency = ((double)cycles / (double)array_elements) - 6.0 ;

    fprintf(result_file, "Statistics\n") ;
    // cycles is an unsigned 32-bit count: convert and print unsigned
    fprintf(result_file, " Cycles: %lu\n", (unsigned long) cycles) ;
    fprintf(result_file, " Latency: %6.3f\n", latency) ;

    fprintf(result_file, "Run execution summary\n") ;
    print_process_times() ;
    print_elapsed_time() ;
    close_result_file() ;

    return( EXIT_SUCCESS ) ;
}