2bc501dd85
To see the output, add -DDEBUG to CFLAGS in Makefile.rules and hook up a terminal to the Pi's serial port (pins 8 and 10): 115200 baud, 8 data bits, 1 stop bit, no parity.
300 lines
6.5 KiB
C
300 lines
6.5 KiB
C
// Part of PiTubeDirect
|
|
// https://github.com/hoglet67/PiTubeDirect
|
|
#include <stdio.h>
|
|
#include <inttypes.h>
|
|
#include <string.h>
|
|
#include "startup.h"
|
|
#include "performance.h"
|
|
|
|
// Printable names for the performance counter event types. type_lookup()
// indexes this table directly with the event type number, so every entry
// carries its index explicitly. One of three tables is selected at
// compile time.

#if defined(RPI3)

// RPI3 build: event types 0x00..0x1F
const char *type_names[] = {
   [0x00] = "SW_INCR",
   [0x01] = "L1I_CACHE_REFILL",
   [0x02] = "L1I_TLB_REFILL",
   [0x03] = "L1D_CACHE_REFILL",
   [0x04] = "L1D_CACHE",
   [0x05] = "L1D_TLB_REFILL",
   [0x06] = "LD_RETIRED",
   [0x07] = "ST_RETIRED",
   [0x08] = "INST_RETIRED",
   [0x09] = "EXC_TAKEN",
   [0x0A] = "EXC_RETURN",
   [0x0B] = "CID_WRITE_RETIRED",
   [0x0C] = "PC_WRITE_RETIRED",
   [0x0D] = "BR_IMM_RETIRED",
   [0x0E] = "BR_RETURN_RETIRED",
   [0x0F] = "UNALIGNED_LDST_RETIRED",
   [0x10] = "BR_MIS_PRED",
   [0x11] = "CPU_CYCLES",
   [0x12] = "BR_PRED",
   [0x13] = "MEM_ACCESS",
   [0x14] = "L1I_CACHE",
   [0x15] = "L1D_CACHE_WB",
   [0x16] = "L2D_CACHE",
   [0x17] = "L2D_CACHE_REFILL",
   [0x18] = "L2D_CACHE_WB",
   [0x19] = "BUS_ACCESS",
   [0x1A] = "MEMORY_ERROR",
   [0x1B] = "INST_SPEC",
   [0x1C] = "TTRB_WRITE_RETIRED",
   [0x1D] = "BUS_CYCLES",
   [0x1E] = "CHAIN",
   [0x1F] = "L1D_CACHE_ALLOCATE"
};

#elif defined(RPI2)

// RPI2 build: the 32 event names have not been filled in yet
const char *type_names[] = {
   [0x00] = "TODO",
   [0x01] = "TODO",
   [0x02] = "TODO",
   [0x03] = "TODO",
   [0x04] = "TODO",
   [0x05] = "TODO",
   [0x06] = "TODO",
   [0x07] = "TODO",
   [0x08] = "TODO",
   [0x09] = "TODO",
   [0x0A] = "TODO",
   [0x0B] = "TODO",
   [0x0C] = "TODO",
   [0x0D] = "TODO",
   [0x0E] = "TODO",
   [0x0F] = "TODO",
   [0x10] = "TODO",
   [0x11] = "TODO",
   [0x12] = "TODO",
   [0x13] = "TODO",
   [0x14] = "TODO",
   [0x15] = "TODO",
   [0x16] = "TODO",
   [0x17] = "TODO",
   [0x18] = "TODO",
   [0x19] = "TODO",
   [0x1A] = "TODO",
   [0x1B] = "TODO",
   [0x1C] = "TODO",
   [0x1D] = "TODO",
   [0x1E] = "TODO",
   [0x1F] = "TODO"
};

#else

// Default (arm11) build: event types 0x00..0x38; numbers without a name
// are listed as "UNDEFINED" so the table stays directly indexable
const char *type_names[] = {
   [0x00] = "I_CACHE_MISS",
   [0x01] = "IBUF_STALL",
   [0x02] = "DATA_DEP_STALL",
   [0x03] = "I_MICROTLB_MISS",
   [0x04] = "D_MICROTLB_MISS",
   [0x05] = "BRANCH_EXECUTED",
   [0x06] = "BRANCH_PRED_INCORRECT",
   [0x07] = "INSTRUCTION_EXECUTED",
   [0x08] = "UNDEFINED",
   [0x09] = "D_CACHE_ACCESS_CACHEABLE",
   [0x0A] = "D_CACHE_ACCESS",
   [0x0B] = "D_CACHE_MISS",
   [0x0C] = "D_CACHE_WRITEBACK",
   [0x0D] = "SOFTWARE_CHANGED_PC",
   [0x0E] = "UNDEFINED",
   [0x0F] = "MAINTLB_MISS",
   [0x10] = "EXPLICIT_DATA_ACCESS",
   [0x11] = "FULL_LOAD_STORE_REQ_QUEUE",
   [0x12] = "WRITE_BUFF_DRAINED",
   [0x13] = "UNDEFINED",
   [0x14] = "UNDEFINED",
   [0x15] = "UNDEFINED",
   [0x16] = "UNDEFINED",
   [0x17] = "UNDEFINED",
   [0x18] = "UNDEFINED",
   [0x19] = "UNDEFINED",
   [0x1A] = "UNDEFINED",
   [0x1B] = "UNDEFINED",
   [0x1C] = "UNDEFINED",
   [0x1D] = "UNDEFINED",
   [0x1E] = "UNDEFINED",
   [0x1F] = "UNDEFINED",
   [0x20] = "EXT0",
   [0x21] = "EXT1",
   [0x22] = "EXT0_AND_EXT1",
   [0x23] = "PROC_RETURN_PUSHED",
   [0x24] = "PROC_RETURN_POPPED",
   [0x25] = "PROC_RETURN_PRED_CORRECT",
   [0x26] = "PROC_RETURN_PRED_INCORRECT",
   [0x27] = "UNDEFINED",
   [0x28] = "UNDEFINED",
   [0x29] = "UNDEFINED",
   [0x2A] = "UNDEFINED",
   [0x2B] = "UNDEFINED",
   [0x2C] = "UNDEFINED",
   [0x2D] = "UNDEFINED",
   [0x2E] = "UNDEFINED",
   [0x2F] = "UNDEFINED",
   [0x30] = "UNDEFINED",
   [0x31] = "UNDEFINED",
   [0x32] = "UNDEFINED",
   [0x33] = "UNDEFINED",
   [0x34] = "UNDEFINED",
   [0x35] = "UNDEFINED",
   [0x36] = "UNDEFINED",
   [0x37] = "UNDEFINED",
   [0x38] = "UNDEFINED",
};

#endif
|
|
|
|
const char *type_lookup(int type) {
|
|
static const char *UNKNOWN = "UNKNOWN";
|
|
int num_types = sizeof(type_names) / sizeof(type_names[0]);
|
|
if (type >= 0 || type < num_types) {
|
|
return type_names[type];
|
|
}
|
|
else {
|
|
return UNKNOWN;
|
|
}
|
|
}
|
|
|
|
|
|
// Set control register and zero counters
|
|
void reset_performance_counters(perf_counters_t *pct) {
|
|
// bit 3 = 1 means count every 64th processor cycle
|
|
// bit 2 = 1 means reset cycle counter to zero
|
|
// bit 1 = 1 means reset counters to zero
|
|
// bit 0 = 1 enable counters
|
|
unsigned ctrl = 0x0F;
|
|
|
|
|
|
#if defined(RPI2) || defined(RPI3)
|
|
int i;
|
|
unsigned cntenset = (1 << 31);
|
|
|
|
unsigned type_impl;
|
|
|
|
// Read the common event identification register to see test whether the requested event is implemented
|
|
asm volatile ("mrc p15,0,%0,c9,c12,6" : "=r" (type_impl));
|
|
|
|
for (i = 0; i < pct->num_counters; i++) {
|
|
if ((type_impl >> pct->type[i]) & 1) {
|
|
// Select the event count/type via the event type selection register
|
|
asm volatile ("mcr p15,0,%0,c9,c12,5" :: "r" (i) : "memory");
|
|
// Configure the required event type
|
|
asm volatile ("mcr p15,0,%0,c9,c13,1" :: "r" (pct->type[i]) : "memory");
|
|
// Set the bit to enable the counter
|
|
cntenset |= (1 << i);
|
|
}
|
|
else {
|
|
printf("Event: %s not implemented\r\n", type_lookup(pct->type[i]));
|
|
}
|
|
}
|
|
// Write the control register
|
|
asm volatile ("mcr p15,0,%0,c9,c12,0" :: "r" (ctrl) : "memory");
|
|
|
|
// Enable the counters
|
|
asm volatile ("mcr p15,0,%0,c9,c12,1" :: "r" (cntenset) : "memory");
|
|
#else
|
|
// Only two counters (0 and 1) are supported on the arm11
|
|
ctrl |= (pct->type[0] << 20);
|
|
ctrl |= (pct->type[1] << 12);
|
|
asm volatile ("mcr p15,0,%0,c15,c12,0" :: "r" (ctrl) : "memory");
|
|
#endif
|
|
}
|
|
|
|
// Copy the current hardware counts into *pct.
//
// RPI2/RPI3 builds: each of the first pct->num_counters event counters is
// selected in turn and read into pct->counter[i]; the cycle counter is
// then read into pct->cycle_counter.
//
// Other builds: the two arm11 event counters are read into pct->counter[0]
// and pct->counter[1], followed by the cycle counter.
//
// The counters are neither stopped nor cleared by this function.
void read_performance_counters(perf_counters_t *pct) {
#if defined(RPI2) || defined(RPI3)
   int i;
   for (i = 0; i < pct->num_counters; i++) {
      // Select the event count/type via the event type selection register
      asm volatile ("mcr p15,0,%0,c9,c12,5" :: "r" (i) : "memory");
      // Synchronise so the selection is in effect before the count is
      // read; without this the read may return the previously selected
      // counter
      asm volatile ("isb" ::: "memory");
      // Read the required event count
      asm volatile ("mrc p15,0,%0,c9,c13,2" : "=r" (pct->counter[i]));
   }
   // Read the cycle counter last
   asm volatile ("mrc p15,0,%0,c9,c13,0" : "=r" (pct->cycle_counter));
#else
   // Only two counters (0 and 1) are supported on the arm11
   asm volatile ("mrc p15,0,%0,c15,c12,2" : "=r" (pct->counter[0]));
   asm volatile ("mrc p15,0,%0,c15,c12,3" : "=r" (pct->counter[1]));
   asm volatile ("mrc p15,0,%0,c15,c12,1" : "=r" (pct->cycle_counter));
#endif
}
|
|
|
|
// Print the counts held in *pct, one per line.
//
// The first line shows the cycle count multiplied by 64 (the control word
// written by reset_performance_counters() counts every 64th cycle). It is
// followed by one line per event counter, labelled with the name returned
// by type_lookup() for that counter's event type.
void print_performance_counters(perf_counters_t *pct) {
   // Widen before scaling so the multiplication is done in 64 bits
   const uint64_t scaled_cycles = (uint64_t) pct->cycle_counter * 64;
   int idx = 0;

   printf("%26s = %"PRIu64"\r\n", "cycle counter", scaled_cycles);

   while (idx < pct->num_counters) {
      printf("%26s = %u\r\n", type_lookup(pct->type[idx]), pct->counter[idx]);
      idx++;
   }
}
|
|
/*
|
|
int benchmark() {
|
|
int i;
|
|
int total;
|
|
int size;
|
|
perf_counters_t pct;
|
|
unsigned char mem1[1024 * 1024];
|
|
unsigned char mem2[1024 * 1024];
|
|
|
|
#if defined(RPI2) || defined(RPI3)
|
|
pct.num_counters = 6;
|
|
pct.type[0] = PERF_TYPE_L1I_CACHE;
|
|
pct.type[1] = PERF_TYPE_L1I_CACHE_REFILL;
|
|
pct.type[2] = PERF_TYPE_L1D_CACHE;
|
|
pct.type[3] = PERF_TYPE_L1D_CACHE_REFILL;
|
|
pct.type[4] = PERF_TYPE_L2D_CACHE_REFILL;
|
|
pct.type[5] = PERF_TYPE_INST_RETIRED;
|
|
pct.counter[0] = 100;
|
|
pct.counter[1] = 101;
|
|
pct.counter[2] = 102;
|
|
pct.counter[3] = 103;
|
|
pct.counter[4] = 104;
|
|
pct.counter[5] = 105;
|
|
#else
|
|
pct.num_counters = 2;
|
|
pct.type[0] = PERF_TYPE_I_CACHE_MISS;
|
|
pct.type[1] = PERF_TYPE_D_CACHE_MISS;
|
|
#endif
|
|
|
|
printf("benchmarking core....\r\n");
|
|
reset_performance_counters(&pct);
|
|
// These only work on Pi 1
|
|
//_invalidate_icache();
|
|
//_invalidate_dcache();
|
|
total = 0;
|
|
for (i = 0; i < 1000000; i++) {
|
|
if ((i & 3) == 0) {
|
|
total += i;
|
|
}
|
|
else {
|
|
total -= i;
|
|
}
|
|
}
|
|
read_performance_counters(&pct);
|
|
print_performance_counters(&pct);
|
|
|
|
#ifdef HAS_40PINS
|
|
printf("benchmarking io toggling....\r\n");
|
|
reset_performance_counters(&pct);
|
|
_toggle_test_pin(1000000);
|
|
read_performance_counters(&pct);
|
|
print_performance_counters(&pct);
|
|
#endif
|
|
|
|
for (i = 0; i <= 10; i++) {
|
|
size = 1 << i;
|
|
printf("benchmarking %dKB memory copy....\r\n", size);
|
|
size *= 1024;
|
|
reset_performance_counters(&pct);
|
|
// These only work on Pi 1
|
|
//_invalidate_icache();
|
|
//_invalidate_dcache();
|
|
memcpy(mem1, mem2, size);
|
|
read_performance_counters(&pct);
|
|
print_performance_counters(&pct);
|
|
}
|
|
|
|
return total;
|
|
}
|
|
*/
|