// interchange.c

//
// Loop nest interchange matrix multiplication
//

//
// Author:  P.J. Drongowski
// Date:    18 March 2015
//
// Copyright (c) 2013-2015 Paul J. Drongowski
//

#include <stdlib.h>
#include <stdio.h>

#include "../test_common/test_common.h"
#include "../test_common/rpi_pmu.h"

// Name of the file that main() passes to create_result_file()
#define RESULT_FILE_NAME  "interchange.txt"

// Dimension of the square matrices. The arrays below, the loop bounds
// and the instruction-count estimates in main() all use this constant.
#define MSIZE 500
// Matrix size requested with the -m option. It is assigned in main()
// but is not read anywhere else in this file; the loops always use MSIZE.
int matrix_size_count = MSIZE ;

// Operand matrices (a and b) and result matrix (r). All three are
// filled in by initialize_matrices().
float matrix_a[MSIZE][MSIZE] ;
float matrix_b[MSIZE][MSIZE] ;
float matrix_r[MSIZE][MSIZE] ;

//
// Run options (some not implemented)
//
// Each flag is 0 by default and is set to 1 by main() when the
// corresponding option appears on the command line. The d, h and v
// flags that main() also sets are not defined in this file
// (presumably they come from test_common.h -- confirm).
//
int c_flag = 0 ;    // Measure cache behavior
int i_flag = 0 ;    // Measure IPC
int m_flag = 0 ;    // Set matrix size to N
int t_flag = 0 ;    // Measure TLB behavior

// getopt()-style option string. Nothing in this file reads it; main()
// parses argv by hand.
#define OPTIONS "cdhim:tv"  // Currently unimplemented (getopt)

// Usage text, one line per option, terminated by NULL. Passed to
// display_usage_info() when -h is given.
char *usage_strings[] = {
  "  -c    Measure cache behavior",
  "  -d    Display debugging information",
  "  -h    Display usage information",
  "  -i    Measure IPC",
  "  -m N  Set the matrix dimensions to N elements",
  "  -t    Measure TLB behavior",
  "  -v    Display more information (verbose output)",
  NULL
} ;

//
// Fill both operand matrices with pseudo-random values and clear
// the result matrix.
//
// Every operand element is rand() divided by RAND_MAX, so it lies
// in the range [0.0, 1.0]. This function does not call srand().
//
void initialize_matrices()
{
  for (int row = 0 ; row < MSIZE ; row++) {
    for (int col = 0 ; col < MSIZE ; col++) {
      // Draw the value for matrix_a first, then the one for matrix_b,
      // so the sequence of rand() calls stays the same
      const float a_value = (float) rand() / RAND_MAX ;
      const float b_value = (float) rand() / RAND_MAX ;

      matrix_a[row][col] = a_value ;
      matrix_b[row][col] = b_value ;
      matrix_r[row][col] = 0.0 ;
    }
  }
}

//
// Accumulate the product of matrix_a and matrix_b into matrix_r.
//
// matrix_r is added to, not overwritten, so it must be cleared
// beforehand (initialize_matrices() does this).
//
void multiply_matrices()
{
  // Multiply the two matrices
  //
  // Please note that the nesting of the two innermost loops has
  // been interchanged: the product index (term) is the middle loop
  // and the column index (col) is the innermost loop. The innermost
  // loop therefore walks along a row of matrix_b and a row of
  // matrix_r sequentially using a small stride (one.) This change
  // improves access to memory data through the data cache. Data
  // translation lookaside buffer (DTLB) behavior is also improved.
  for (int row = 0 ; row < MSIZE ; row++) {
    for (int term = 0 ; term < MSIZE ; term++) {
      for (int col = 0 ; col < MSIZE ; col++) {
        matrix_r[row][col] += matrix_a[row][term] * matrix_b[term][col] ;
      }
    }
  }
}

//
// Run the matrix multiplication while counting executed instructions
// and CPU cycles. main() calls this for the -i (IPC) option.
//
void measure_cpi()
{
  // Initialize before counting starts so that only the multiplication
  // falls between start_counting() and stop_counting()
  initialize_matrices() ;

  start_counting(ARMV6_EVENT_INSTR_EXEC, ARMV6_EVENT_CPU_CYCLES) ;

  multiply_matrices() ;

  stop_counting() ;
}

//
// Run the matrix multiplication while counting the two data cache
// events named below. main() calls this for the -c option.
//
void measure_cache()
{
  // Initialize before counting starts so that only the multiplication
  // falls between start_counting() and stop_counting()
  initialize_matrices() ;

  // Event names suggest data cache accesses and data cache misses;
  // see rpi_pmu.h for the exact definitions
  start_counting(ARMV6_EVENT_DCACHE_CACCESS, ARMV6_EVENT_DCACHE_MISS) ;

  multiply_matrices() ;

  stop_counting() ;
}

//
// Run the matrix multiplication while counting the two TLB miss
// events named below. main() calls this for the -t option.
//
void measure_tlb()
{
  // Initialize before counting starts so that only the multiplication
  // falls between start_counting() and stop_counting()
  initialize_matrices() ;

  // Event names suggest data TLB misses and main TLB misses;
  // see rpi_pmu.h for the exact definitions
  start_counting(ARMV6_EVENT_DTLB_MISS, ARMV6_EVENT_MAIN_TLB_MISS) ;

  multiply_matrices() ;

  stop_counting() ;
}

//
// Initialize and multiply the matrices without starting or stopping
// any event counting. main() calls this when none of -c, -t or -i
// is given.
//
void run_no_events()
{
  initialize_matrices() ;
  multiply_matrices() ;
}

//
// Program entry point
//
// Parses the command line, runs the matrix multiplication once
// (optionally while counting one pair of performance events) and
// writes a report to the result file (RESULT_FILE_NAME).
//
// Options are single characters preceded by '-'. Arguments that do
// not begin with '-' are ignored. If more than one of -c, -t and -i
// is given, only one measurement is made, chosen in that order.
//
// Returns EXIT_SUCCESS. Exits with EXIT_FAILURE if the result file
// cannot be created.
//
int main(int argc, char* argv[])
{
  int arg_count ;
  // 64-bit so that the instruction estimates cannot overflow, even
  // where unsigned long is only 32 bits wide
  unsigned long long outer, middle, inner, total ;
  unsigned long long cycles ;
  unsigned int cpu_speed ;
  double elapsed = 0 ;

#ifdef _DEBUG
  d_flag = 1 ;
#endif

  // Parse any command line arguments. Use simple one character
  // option names preceded by a '-' character.
  for (arg_count = 1 ; arg_count < argc ; arg_count++)
  {
    if( argv[arg_count][0] == '-' )
    {
      switch( argv[arg_count][1] )
      {
      case 'c' :
        {
          // Measure data cache behavior
          c_flag = 1 ;
          break ;
        }
      case 'd' :
        {
          // Enable debug output
          d_flag = 1 ;
          break ;
        }
      case 'h' :
        {
          // Display usage information
          display_usage_info(argv[0], usage_strings) ;
          h_flag = 1 ;
          break ;
        }
      case 'i' :
        {
          // Measure IPC
          i_flag = 1 ;
          break ;
        }
      case 'm' :
        {
          // Set matrix size. The value is recorded, but the loops
          // still use the compile-time MSIZE.
          fprintf(stderr, "*warning* -m is unimplemented\n") ;
          m_flag = 1 ;
          arg_count++ ;
          if (arg_count < argc)
          {
            // The matrices are statically dimensioned, so only sizes
            // in the range 1..MSIZE make sense. strtol() also allows
            // non-numeric text to be rejected, which atoi() cannot
            // report. An out-of-range value saturates at LONG_MIN or
            // LONG_MAX and is caught by the range test.
            char *end = NULL ;
            long requested = strtol(argv[arg_count], &end, 10) ;

            if (end == argv[arg_count] || *end != '\0' ||
                requested < 1 || requested > MSIZE)
            {
              fprintf(stderr, "*warning* ignoring invalid matrix size: %s\n",
                      argv[arg_count]) ;
            }
            else
            {
              matrix_size_count = (int) requested ;
            }
          }
          else
          {
            fprintf(stderr, "*warning* -m requires a value\n") ;
          }
          break ;
        }
      case 't' :
        {
          // Measure TLB behavior
          t_flag = 1 ;
          break ;
        }
      case 'v' :
        {
          // Enable verbose output
          v_flag = 1 ;
          break ;
        }
      default:
        {
          printf("Illegal option: %c\n", argv[arg_count][1]) ;
          break ;
        }
      }
    }
  }

  if (create_result_file(RESULT_FILE_NAME) == 0) {
    exit( EXIT_FAILURE ) ;
  }

  // Make exactly one run; the first matching flag wins
  if (c_flag) {
    // Measure cache behavior
    measure_cache() ;
  }
  else if (t_flag) {
    // Measure TLB behavior
    measure_tlb() ;
  }
  else if (i_flag) {
    // Measure IPC
    measure_cpi() ;
  } else {
    // Don't measure any events
    run_no_events() ;
  }

  print_heading("Loop nest interchange matrix multiplication") ;
  print_system_info() ;

  // Number of ARM instructions in the loop nest. The per-iteration
  // counts (8, 9 and 9) are fixed estimates. The ULL suffix forces
  // the products to be evaluated in unsigned long long instead of int.
  outer = 8ULL * MSIZE ;
  middle = 9ULL * MSIZE * MSIZE ;
  inner = 9ULL * MSIZE * MSIZE * MSIZE ;
  total = outer + middle + inner ;
  fprintf(result_file, "Run information\n") ;
  fprintf(result_file, "  Outer loop instructions:  %llu\n", outer) ;
  fprintf(result_file, "  Middle loop instructions: %llu\n", middle) ;
  fprintf(result_file, "  Inner loop instructions:  %llu\n", inner) ;
  fprintf(result_file, "  Total instructions:       %llu\n", total) ;

  // Estimate cycles as elapsed time multiplied by the clock rate.
  // The factor 1000000.0 converts MHz to Hz.
  elapsed = get_elapsed_time() ;
  cpu_speed = get_cpu_speed() ; // MHz
  cycles = (long long int)(elapsed * 1000000.0 * (double)cpu_speed) ;
  fprintf(result_file, "  Estimated cycles:         %llu\n", cycles) ;
  fprintf(result_file, "  Cycles (scaled by 64):    %llu\n", cycles / 64) ;

  fprintf(result_file,"Performance Monitor events\n") ;
  print_counts(result_file) ;

  fprintf(result_file, "\nRun execution summary\n") ;
  print_process_times() ;
  print_elapsed_time() ;

  close_result_file() ;
  return( EXIT_SUCCESS ) ;
}