diff --git a/3rdPartyFiles.txt b/3rdPartyFiles.txt index 69026bd..f312143 100644 --- a/3rdPartyFiles.txt +++ b/3rdPartyFiles.txt @@ -22,6 +22,8 @@ rpi-gpio.c rpi-mailbox-interface.h (Added some missing ones) rpi-mailbox-interface.c linker.ld +performance.h +performance.c R Stange's USPi https://github.com/rsta2/uspi diff --git a/Makefile b/Makefile index 8df1180..89e64ec 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ OBJS = armc-start.o armc-cstartup.o armc-cstubs.o armc-cppstubs.o \ exception.o main.o rpi-aux.o rpi-i2c.o rpi-mailbox-interface.o rpi-mailbox.o \ - rpi-gpio.o rpi-interrupts.o cache.o ff.o interrupt.o Keyboard.o \ + rpi-gpio.o rpi-interrupts.o cache.o ff.o interrupt.o Keyboard.o performance.o \ Pi1541.o DiskImage.o iec_bus.o iec_commands.o m6502.o m6522.o \ Drive.o gcr.o prot.o lz.o emmc.o diskio.o options.o Screen.o SSD1306.o ScreenLCD.o \ Timer.o FileBrowser.o DiskCaddy.o ROMs.o InputMappings.o xga_font_data.o diff --git a/src/Drive.cpp b/src/Drive.cpp index 6df8aa9..23f902c 100644 --- a/src/Drive.cpp +++ b/src/Drive.cpp @@ -20,6 +20,16 @@ #include "m6522.h" #include "debug.h" + +//#define PROFILE 1 + +#if defined(PROFILE) +extern "C" +{ +#include "performance.h" +} +#endif + // There is a lot going on even though the emulation code is extremely small. // A few counters, shift registers and the occasional logic gate takes a surprisingly small amount of code to implement. 
@@ -388,6 +398,33 @@ void Drive::OnPortOut(void* pThis, unsigned char status) bool Drive::Update() { +#if defined(PROFILE) + perf_counters_t pct; + reset_performance_counters(&pct); + +#if defined(RPI2) || defined(RPI3) + pct.num_counters = 6; + pct.type[0] = PERF_TYPE_L1I_CACHE; + pct.type[1] = PERF_TYPE_L1I_CACHE_REFILL; + pct.type[2] = PERF_TYPE_L1D_CACHE; + pct.type[3] = PERF_TYPE_L1D_CACHE_REFILL; + pct.type[4] = PERF_TYPE_L2D_CACHE_REFILL; + pct.type[5] = PERF_TYPE_INST_RETIRED; + pct.counter[0] = 100; + pct.counter[1] = 101; + pct.counter[2] = 102; + pct.counter[3] = 103; + pct.counter[4] = 104; + pct.counter[5] = 105; +#else + pct.num_counters = 2; + //pct.type[0] = PERF_TYPE_EVERY_CYCLE; + pct.type[0] = PERF_TYPE_I_CACHE_MISS; + pct.type[1] = PERF_TYPE_D_CACHE_MISS; +#endif + +#endif + bool dataReady = false; // When swapping some lame loaders monitor the write protect flag. @@ -497,5 +534,11 @@ bool Drive::Update() } } m_pVIA->InputCA1(!SO); + +#if defined(PROFILE) + read_performance_counters(&pct); + print_performance_counters(&pct); +#endif + return dataReady; } diff --git a/src/performance.c b/src/performance.c new file mode 100644 index 0000000..041655c --- /dev/null +++ b/src/performance.c @@ -0,0 +1,300 @@ +// Part of PiTubeDirect +// https://github.com/hoglet67/PiTubeDirect +#include +#include +#include +#include "startup.h" +#include "performance.h" + +#if defined(RPI3) + +const char * type_names[] = { + + "SW_INCR", + "L1I_CACHE_REFILL", + "L1I_TLB_REFILL", + "L1D_CACHE_REFILL", + "L1D_CACHE", + "L1D_TLB_REFILL", + "LD_RETIRED", + "ST_RETIRED", + "INST_RETIRED", + "EXC_TAKEN", + "EXC_RETURN", + "CID_WRITE_RETIRED", + "PC_WRITE_RETIRED", + "BR_IMM_RETIRED", + "BR_RETURN_RETIRED", + "UNALIGNED_LDST_RETIRED", + "BR_MIS_PRED", + "CPU_CYCLES", + "BR_PRED", + "MEM_ACCESS", + "L1I_CACHE", + "L1D_CACHE_WB", + "L2D_CACHE", + "L2D_CACHE_REFILL", + "L2D_CACHE_WB", + "BUS_ACCESS", + "MEMORY_ERROR", + "INST_SPEC", + "TTRB_WRITE_RETIRED", + "BUS_CYCLES", + 
"CHAIN", + "L1D_CACHE_ALLOCATE" +}; + +#elif defined(RPI2) + +const char * type_names[] = { + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO", + "TODO" +}; + +#else + +const char * type_names[] = { + "I_CACHE_MISS", + "IBUF_STALL", + "DATA_DEP_STALL", + "I_MICROTLB_MISS", + "D_MICROTLB_MISS", + "BRANCH_EXECUTED", + "BRANCH_PRED_INCORRECT", + "INSTRUCTION_EXECUTED", + "UNDEFINED", + "D_CACHE_ACCESS_CACHEABLE", + "D_CACHE_ACCESS", + "D_CACHE_MISS", + "D_CACHE_WRITEBACK", + "SOFTWARE_CHANGED_PC", + "UNDEFINED", + "MAINTLB_MISS", + "EXPLICIT_DATA_ACCESS", + "FULL_LOAD_STORE_REQ_QUEUE", + "WRITE_BUFF_DRAINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "EXT0", + "EXT1", + "EXT0_AND_EXT1", + "PROC_RETURN_PUSHED", + "PROC_RETURN_POPPED", + "PROC_RETURN_PRED_CORRECT", + "PROC_RETURN_PRED_INCORRECT", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", + "UNDEFINED", +}; + +#endif + +const char *type_lookup(int type) { + static const char *UNKNOWN = "UNKNOWN"; + int num_types = sizeof(type_names) / sizeof(type_names[0]); + if (type >= 0 || type < num_types) { + return type_names[type]; + } + else { + return UNKNOWN; + } +} + + +// Set control register and zero counters +void reset_performance_counters(perf_counters_t *pct) { + // bit 3 = 1 means count every 64th processor cycle + // bit 2 = 1 means reset cycle counter to zero + // bit 1 = 1 means reset counters to zero + 
// bit 0 = 1 enable counters + unsigned ctrl = 0x0F; + + +#if defined(RPI2) || defined(RPI3) + int i; + unsigned cntenset = (1 << 31); + + unsigned type_impl; + + // Read the common event identification register to see test whether the requested event is implemented + asm volatile ("mrc p15,0,%0,c9,c12,6" : "=r" (type_impl)); + + for (i = 0; i < pct->num_counters; i++) { + if ((type_impl >> pct->type[i]) & 1) { + // Select the event count/type via the event type selection register + asm volatile ("mcr p15,0,%0,c9,c12,5" :: "r" (i) : "memory"); + // Configure the required event type + asm volatile ("mcr p15,0,%0,c9,c13,1" :: "r" (pct->type[i]) : "memory"); + // Set the bit to enable the counter + cntenset |= (1 << i); + } + else { + printf("Event: %s not implemented\r\n", type_lookup(pct->type[i])); + } + } + // Write the control register + asm volatile ("mcr p15,0,%0,c9,c12,0" :: "r" (ctrl) : "memory"); + + // Enable the counters + asm volatile ("mcr p15,0,%0,c9,c12,1" :: "r" (cntenset) : "memory"); +#else + // Only two counters (0 and 1) are supported on the arm11 + ctrl |= (pct->type[0] << 20); + ctrl |= (pct->type[1] << 12); + asm volatile ("mcr p15,0,%0,c15,c12,0" :: "r" (ctrl) : "memory"); +#endif +} + +void read_performance_counters(perf_counters_t *pct) { +#if defined(RPI2) || defined(RPI3) + int i; + for (i = 0; i < pct->num_counters; i++) { + // Select the event count/type via the event type selection register + asm volatile ("mcr p15,0,%0,c9,c12,5" :: "r" (i) : "memory"); + // Read the required event count + asm volatile ("mrc p15,0,%0,c9,c13,2" : "=r" (pct->counter[i])); + } + asm volatile ("mrc p15,0,%0,c9,c13,0" : "=r" (pct->cycle_counter)); +#else + // Only two counters (0 and 1) are supported on the arm11 + asm volatile ("mrc p15,0,%0,c15,c12,2" : "=r" (pct->counter[0])); + asm volatile ("mrc p15,0,%0,c15,c12,3" : "=r" (pct->counter[1])); + asm volatile ("mrc p15,0,%0,c15,c12,1" : "=r" (pct->cycle_counter)); +#endif +} + +void 
print_performance_counters(perf_counters_t *pct) { + int i; + uint64_t cycle_counter = pct->cycle_counter; + cycle_counter *= 64; + printf("%26s = %"PRIu64"\r\n", "cycle counter", cycle_counter); + for (i = 0; i < pct->num_counters; i++) { + printf("%26s = %u\r\n", type_lookup(pct->type[i]), pct->counter[i]); + } +} +/* +int benchmark() { + int i; + int total; + int size; + perf_counters_t pct; + unsigned char mem1[1024 * 1024]; + unsigned char mem2[1024 * 1024]; + +#if defined(RPI2) || defined(RPI3) + pct.num_counters = 6; + pct.type[0] = PERF_TYPE_L1I_CACHE; + pct.type[1] = PERF_TYPE_L1I_CACHE_REFILL; + pct.type[2] = PERF_TYPE_L1D_CACHE; + pct.type[3] = PERF_TYPE_L1D_CACHE_REFILL; + pct.type[4] = PERF_TYPE_L2D_CACHE_REFILL; + pct.type[5] = PERF_TYPE_INST_RETIRED; + pct.counter[0] = 100; + pct.counter[1] = 101; + pct.counter[2] = 102; + pct.counter[3] = 103; + pct.counter[4] = 104; + pct.counter[5] = 105; +#else + pct.num_counters = 2; + pct.type[0] = PERF_TYPE_I_CACHE_MISS; + pct.type[1] = PERF_TYPE_D_CACHE_MISS; +#endif + + printf("benchmarking core....\r\n"); + reset_performance_counters(&pct); + // These only work on Pi 1 + //_invalidate_icache(); + //_invalidate_dcache(); + total = 0; + for (i = 0; i < 1000000; i++) { + if ((i & 3) == 0) { + total += i; + } + else { + total -= i; + } + } + read_performance_counters(&pct); + print_performance_counters(&pct); + +#ifdef HAS_40PINS + printf("benchmarking io toggling....\r\n"); + reset_performance_counters(&pct); + _toggle_test_pin(1000000); + read_performance_counters(&pct); + print_performance_counters(&pct); +#endif + + for (i = 0; i <= 10; i++) { + size = 1 << i; + printf("benchmarking %dKB memory copy....\r\n", size); + size *= 1024; + reset_performance_counters(&pct); + // These only work on Pi 1 + //_invalidate_icache(); + //_invalidate_dcache(); + memcpy(mem1, mem2, size); + read_performance_counters(&pct); + print_performance_counters(&pct); + } + + return total; +} +*/ diff --git a/src/performance.h 
b/src/performance.h
new file mode 100644
index 0000000..f9b1d43
--- /dev/null
+++ b/src/performance.h
@@ -0,0 +1,115 @@
+// Part of PiTubeDirect
+// https://github.com/hoglet67/PiTubeDirect
+// performance.h
+
+#ifndef PERFORMANCE_H
+#define PERFORMANCE_H
+
+#if defined(RPI3) || defined(RPI2)
+
+// TODO - More work is needed on the RPI2 performance metrics
+
+#define MAX_COUNTERS 6
+
+#define PERF_TYPE_SW_INCR                0x00
+#define PERF_TYPE_L1I_CACHE_REFILL       0x01
+#define PERF_TYPE_L1I_TLB_REFILL         0x02
+#define PERF_TYPE_L1D_CACHE_REFILL       0x03
+#define PERF_TYPE_L1D_CACHE              0x04
+#define PERF_TYPE_L1D_TLB_REFILL         0x05
+#define PERF_TYPE_LD_RETIRED             0x06
+#define PERF_TYPE_ST_RETIRED             0x07
+#define PERF_TYPE_INST_RETIRED           0x08
+#define PERF_TYPE_EXC_TAKEN              0x09
+#define PERF_TYPE_EXC_RETURN             0x0A
+#define PERF_TYPE_CID_WRITE_RETIRED      0x0B
+#define PERF_TYPE_PC_WRITE_RETIRED       0x0C
+#define PERF_TYPE_BR_IMM_RETIRED         0x0D
+#define PERF_TYPE_BR_RETURN_RETIRED      0x0E
+#define PERF_TYPE_UNALIGNED_LDST_RETIRED 0x0F
+#define PERF_TYPE_BR_MIS_PRED            0x10
+#define PERF_TYPE_CPU_CYCLES             0x11
+#define PERF_TYPE_BR_PRED                0x12
+#define PERF_TYPE_MEM_ACCESS             0x13
+#define PERF_TYPE_L1I_CACHE              0x14
+#define PERF_TYPE_L1D_CACHE_WB           0x15
+#define PERF_TYPE_L2D_CACHE              0x16
+#define PERF_TYPE_L2D_CACHE_REFILL       0x17
+#define PERF_TYPE_L2D_CACHE_WB           0x18
+#define PERF_TYPE_BUS_ACCESS             0x19
+#define PERF_TYPE_MEMORY_ERROR           0x1A
+#define PERF_TYPE_INST_SPEC              0x1B
+#define PERF_TYPE_TTRB_WRITE_RETIRED     0x1C
+#define PERF_TYPE_BUS_CYCLES             0x1D
+#define PERF_TYPE_CHAIN                  0x1E
+#define PERF_TYPE_L1D_CACHE_ALLOCATE     0x1F
+
+#else
+
+// See http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dai0195b/index.html
+// Read carefully the definitions of the ARM1176 performance events. The "data cache access" events, in particular, only count nonsequential data cache accesses. This important qualification affects the interpretation of performance measurements. In particular, you can't compute a pure data cache miss ratio, that is, all data cache misses divided by all data cache accesses.
+#define MAX_COUNTERS 2
+
+#define PERF_TYPE_I_CACHE_MISS 0x00 // Instruction cache miss. Instruction cache miss to a cacheable location, which requires a fetch from external memory
+#define PERF_TYPE_IBUF_STALL 0x01 // Stall because instruction buffer cannot deliver an instruction. This could indicate an Instruction Cache miss or an Instruction MicroTLB miss. This event occurs every cycle in which the condition is present.
+#define PERF_TYPE_DATA_DEP_STALL 0x02 // Stall because of a data dependency. This event occurs every cycle in which the condition is present.
+#define PERF_TYPE_I_MICROTLB_MISS 0x03 // Instruction MicroTLB miss (unused on ARM1156).
+#define PERF_TYPE_D_MICROTLB_MISS 0x04 // Data MicroTLB miss (unused on ARM1156).
+#define PERF_TYPE_BRANCH_EXECUTED 0x05 // Branch instruction executed, branch might or might not have changed program flow.
+#define PERF_TYPE_BRANCH_PRED_INCORRECT 0x06 // Branch mis-predicted.
+#define PERF_TYPE_INSTRUCTION_EXECUTED 0x07 // Instructions executed.
+#define PERF_TYPE_D_CACHE_ACCESS_CACHEABLE 0x09 // Data cache access, not including Cache operations. This event occurs for each non-sequential access to a cache line, for cacheable locations.
+#define PERF_TYPE_D_CACHE_ACCESS 0x0A // Data cache access, not including Cache Operations. This event occurs for each non-sequential access to a cache line, regardless of whether or not the location is cacheable.
+#define PERF_TYPE_D_CACHE_MISS 0x0B // Data cache miss, not including Cache Operations.
+#define PERF_TYPE_D_CACHE_WRITEBACK 0x0C // Data cache write-back. This event occurs once for each half line of four words that is written back from the cache.
+#define PERF_TYPE_SOFTWARE_CHANGED_PC 0x0D // Software changed the PC. This event occurs any time the PC is changed by software and there is not a mode change. For example, a MOV instruction with PC as the destination triggers this event. Executing a SWI from User mode does not trigger this event, because it incurs a mode change.
+#define PERF_TYPE_MAINTLB_MISS 0x0F // Main TLB miss (unused on ARM1156).
+#define PERF_TYPE_EXPLICIT_DATA_ACCESS 0x10 // Explicit external data or peripheral access. This includes cache refill, non-cacheable and write-through accesses. It does not include write-backs or page table walks.
+#define PERF_TYPE_FULL_LOAD_STORE_REQ_QUEUE 0x11 // Stall because of Load Store Unit request queue being full. This event occurs each clock cycle in which the condition is met. A high incidence of this event indicates the LSU is often waiting for transactions to complete on the external bus.
+#define PERF_TYPE_WRITE_BUFF_DRAINED 0x12 // The number of times the Write Buffer was drained because of a Data Synchronization Barrier command or Strongly Ordered operation.
+// 0x13 The number of cycles which FIQ interrupts are disabled (ARM1156 only).
+// 0x14 The number of cycles which IRQ interrupts are disabled (ARM1156 only).
+#define PERF_TYPE_EXT0 0x20 // ETMEXTOUT[0] signal was asserted for a cycle.
+#define PERF_TYPE_EXT1 0x21 // ETMEXTOUT[1] signal was asserted for a cycle.
+#define PERF_TYPE_EXT0_AND_EXT1 0x22 // ETMEXTOUT[0] or ETMEXTOUT[1] was asserted. If both ETMEXTOUT[0] and ETMEXTOUT[1] signals are asserted then the count is incremented by two.
+#define PERF_TYPE_PROC_RETURN_PUSHED 0x23 // Procedure call instruction executed. The procedure return address was pushed on to the return stack (ARM1176 only).
+#define PERF_TYPE_PROC_RETURN_POPPED 0x24 // Procedure return instruction executed. The procedure return address was popped off the return stack (ARM1176 only).
+#define PERF_TYPE_PROC_RETURN_PRED_CORRECT 0x25 // Procedure return instruction executed and return address predicted. The procedure return address was popped off the return stack and the core branched to this address (ARM1176 only).
+#define PERF_TYPE_PROC_RETURN_PRED_INCORRECT 0x26 // Procedure return instruction executed and return address predicted incorrectly. The procedure return address was restored to the return stack following the prediction being identified as incorrect (ARM1176 only).
+// 0x30 Instruction cache Tag or Valid RAM parity error (ARM1156 only).
+// 0x31 Instruction cache RAM parity error (ARM1156 only).
+// 0x32 Data cache Tag or Valid RAM parity error (ARM1156 only).
+// 0x33 Data cache RAM parity error (ARM1156 only).
+// 0x34 ITCM error (ARM1156 only).
+// 0x35 DTCM error (ARM1156 only).
+// 0x36 Procedure return address popped off the return stack (ARM1156 only).
+// 0x37 Procedure return address popped off the return stack has been incorrectly predicted by the PFU (ARM1156 only).
+// 0x38 Data cache Dirty RAM parity error (ARM1156 only).
+#define PERF_TYPE_EVERY_CYCLE 0xFF // An increment each cycle.
+
+#endif
+
+// Event selection and results for one measurement run.
+//   cycle_counter - cycle count as read from the hardware (print_performance_counters scales it by 64)
+//   num_counters  - number of entries of type[] / counter[] in use (at most MAX_COUNTERS)
+//   type[]        - PERF_TYPE_* event to count on each counter, set by the caller
+//   counter[]     - event counts, filled in by read_performance_counters()
+typedef struct {
+   unsigned cycle_counter;
+   int num_counters;
+   int type[MAX_COUNTERS];
+   unsigned counter[MAX_COUNTERS];
+} perf_counters_t;
+
+// Program the events selected in pct->type[], then zero and enable the counters.
+extern void reset_performance_counters(perf_counters_t *pct);
+
+// Read the current counts into pct->counter[] and pct->cycle_counter.
+extern void read_performance_counters(perf_counters_t *pct);
+
+// Print the cycle count and each configured event count using printf.
+extern void print_performance_counters(perf_counters_t *pct);
+
+//extern int benchmark();
+
+#endif