From 6845790ab88234386579a9d7aa571bd6ee50799c Mon Sep 17 00:00:00 2001 From: Per Lindgren <per.lindgren@ltu.se> Date: Wed, 11 Nov 2020 22:39:09 +0100 Subject: [PATCH] rtt_timing --- examples/rtt_simple.rs | 43 ++++++ examples/rtt_timing.rs | 319 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 362 insertions(+) create mode 100644 examples/rtt_simple.rs create mode 100644 examples/rtt_timing.rs diff --git a/examples/rtt_simple.rs b/examples/rtt_simple.rs new file mode 100644 index 0000000..c9005a4 --- /dev/null +++ b/examples/rtt_simple.rs @@ -0,0 +1,43 @@ +//! examples/rtt_simple.rs + +#![deny(unsafe_code)] +#![deny(warnings)] +#![no_main] +#![no_std] + +use panic_rtt_target as _; +use rtt_target::{rprintln, rtt_init_print}; +use stm32f4; + +#[rtic::app(device = stm32f4)] +const APP: () = { + #[init] + fn init(_cx: init::Context) { + rtt_init_print!(); + rprintln!("init"); + } + + #[idle] + fn idle(_cx: idle::Context) -> ! { + rprintln!("idle"); + loop { + continue; + } + } +}; + +// > cargo run rtt_simple +// Compiling app v0.1.0 (/home/pln/courses/e7020e/app) +// Finished dev [unoptimized + debuginfo] target(s) in 0.15s +// Running `probe-run --chip STM32F411RETx target/thumbv7em-none-eabi/debug/examples/rtt_simple` +// flashing program .. +// DONE +// resetting device +// init +// idle +// +// > [Ctrl-C] +// stack backtrace: +// 0: 0x0800031a - rtt_simple::idle +// 1: 0x0800036c - main +// 2: 0x080001da - Reset diff --git a/examples/rtt_timing.rs b/examples/rtt_timing.rs new file mode 100644 index 0000000..7f0c575 --- /dev/null +++ b/examples/rtt_timing.rs @@ -0,0 +1,319 @@ +//! examples/rtt_timing.rs + +// #![deny(unsafe_code)] +#![deny(warnings)] +#![no_main] +#![no_std] + +use cortex_m::{asm, peripheral::DWT}; +use panic_rtt_target as _; +use rtt_target::{rprintln, rtt_init_print}; +use stm32f4; + +#[rtic::app(device = stm32f4)] +const APP: () = { + #[init] + fn init(mut cx: init::Context) { + rtt_init_print!(); + rprintln!("init"); + + // Initialize (enable) the monotonic timer (CYCCNT) + cx.core.DCB.enable_trace(); + cx.core.DWT.enable_cycle_counter(); + + rprintln!("start timed_loop"); + let (start, end) = timed_loop(); + rprintln!( + "start {}, end {}, diff {}", + start, + end, + end.wrapping_sub(start) + ); + } + + #[idle] + fn idle(_cx: idle::Context) -> ! { + rprintln!("idle"); + loop { + continue; + } + } +}; + +// Forbid inlining and keeping name (symbol) readable. +#[inline(never)] +#[no_mangle] +fn timed_loop() -> (u32, u32) { + let start = DWT::get_cycle_count(); + for _ in 0..10000 { + asm::nop(); + } + let end = DWT::get_cycle_count(); + (start, end) +} + +// We can measure execution time using the built in cycle counter, +// a wrapping 32 bit timer. +// +// `DWT::get_cycle_count()` reads the current value. +// +// ------------------------------------------------------------------------ +// Exercises: +// +// A.1) What is the cycle count for the loop? +// > cargo run --example rtt_timing +// +// [Your answer here] +// +// A.2) How many cycles per iteration? +// +// [Your answer here] +// +// ------------------------------------------------------------------------ +// Now try a release (optimized build, see `Cargo.toml` for build options). +// B.1) What is the cycle count for the loop? +// > cargo run --example rtt_timing --release +// +// [Your answer here] +// +// B.2) How many cycles per iteration? +// +// [Your answer here] +// +// What is the speedup (A/B)? +// +// [Your answer here] +// +// Why do you think it differs that much? +// +// [Your answer here] +// +// ------------------------------------------------------------------------ +// In the loop there is just a single assembly instruction (nop). +// However, the API for "inline assembly" is not yet stabilized, thus +// we have to call an "external" functions that implements the assembly. +// +// We can switch to the "nightly" channel (which allows experimental features). +// +// > rustup override set nightly +// info: using existing install for 'nightly-x86_64-unknown-linux-gnu' +// info: override toolchain for '/home/pln/courses/e7020e/app' set to 'nightly-x86_64-unknown-linux-gnu' +// +// nightly-x86_64-unknown-linux-gnu unchanged - rustc 1.49.0-nightly (cf9cf7c92 2020-11-10) +// +// > rustup target add thumbv7em-none-eabi +// (only needed first time) +// +// Now try a release (optimized build, see `Cargo.toml` for build options). +// C.1) What is the cycle count for the loop? +// > cargo run --example rtt_timing --release +// +// [Your answer here] +// +// C.2) How many cycles per iteration? +// +// [Your answer here] +// +// What is the speedup (A/C)? +// +// [Your answer here] +// +// ------------------------------------------------------------------------ +// D) Now lets have a closer look at the generated assembly. +// +// > cargo objdump --example rtt_timing --release --features nightly -- --disassemble > rtt_timing.objdump +// +// Open the file in you editor and search for the `timed_loop`. +// +// [Assembly for function here] +// +// Locate the loop body, and verify that it makes sense +// based on the information from the technical documentation: +// +// https://developer.arm.com/documentation/ddi0439/b/Programmers-Model/Instruction-set-summary/Cortex-M4-instructions +// +// +// ------------------------------------------------------------------------ +// E) Now we shall take detailed control over the debugging. +// +// Alter the .cargo/config +// uncomment: +// runner = "arm-none-eabi-gdb -q -x openocd.gdb" +// or (if under ubuntu) +// runner = "gdb-multiarch -q -x openocd.gdb" +// +// comment: +// # runner = "probe-run --chip STM32F411RETx" +// +// Remember to save the file! +// +// In a separate terminal start `openocd`. +// > openocd -f openocd.cfg +// And in the main terminal run: +// > cargo run --example rtt_timing --release --features nightly +// ... +// halted: PC: 0x0800019a +// 0x0800019a in cortex_m_rt::Reset () at /home/pln/.cargo/registry/src/github.com-1ecc6299db9ec823/cortex-m-rt-0.6.13/src/lib.rs:497 +// 497 pub unsafe extern "C" fn Reset() -> ! { +// +// Now add a breakpoint at `timed_loop`. +// (gdb) break timed_loop +// Breakpoint 5 at 0x800023e: file examples/rtt_timing.rs, line 45. +// +// Now we can continue (first break will be at `init`) +// (gdb) continue +// Breakpoint 4, cortex_m::interrupt::disable () at /home/pln/.cargo/registry/src/github.com-1ecc6299db9ec823/cortex-m-0.6.4/src/interrupt.rs:13 +// llvm_asm!("cpsid i" ::: "memory" : "volatile"); +// (`init` in RTIC runs with interrupts disabled) +// (gdb) continue +// Breakpoint 5, core::ptr::read_volatile<u32> (src=<optimized out>) at /home/pln/.rustup/toolchains/nightly-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/ptr/mod.rs:1052 +// unsafe { intrinsics::volatile_load(src) } +// +// Now we are inside the `timed_loop`. +// disassemble +// Dump of assembler code for function _ZN10rtt_timing10timed_loop17h4a445e5f592f3304E: +// 0x08000232 <+0>: movw r1, #4100 ; 0x1004 +// 0x08000236 <+4>: movw r2, #10000 ; 0x2710 +// 0x0800023a <+8>: movt r1, #57344 ; 0xe000 +// => 0x0800023e <+12>: ldr r0, [r1, #0] +// 0x08000240 <+14>: subs r2, #1 +// 0x08000242 <+16>: nop +// 0x08000244 <+18>: bne.n 0x8000240 <rtt_timing::timed_loop+14> +// 0x08000246 <+20>: ldr r1, [r1, #0] +// 0x08000248 <+22>: bx lr +// +// (breakpoints can be a bit "off" for release builds, here it +// put the breakpoint not at the function entry but rather at the +// first Rust line (the load done by `DWT::get_cycle_count()`)). +// +// Is there a function call to the DWT::get_cycle_count(), +// if not can you explain what has happened here? +// In particular, what is the content of r1? +// +// Confer to the documentation: +// https://developer.arm.com/documentation/ddi0439/b/Data-Watchpoint-and-Trace-Unit/DWT-Programmers-Model +// +// [Your answer here] +// +// Now check your answer by dumping the registers +// (gdb) info registers +// +// [Register dump here] +// +// We can now set a breakpoint exactly at the `nop`. +// +// (gdb) break *0x8000240 +// +// And continue execution to the breakpoint: +// (gdb) continue +// +// (gdb) disassemble +// 0x08000232 <+0>: movw r1, #4100 ; 0x1004 +// 0x08000236 <+4>: movw r2, #10000 ; 0x2710 +// 0x0800023a <+8>: movt r1, #57344 ; 0xe000 +// 0x0800023e <+12>: ldr r0, [r1, #0] +// 0x08000240 <+14>: subs r2, #1 +// => 0x08000242 <+16>: nop +// 0x08000244 <+18>: bne.n 0x8000240 <rtt_timing::timed_loop+14> +// 0x08000246 <+20>: ldr r1, [r1, #0] +// 0x08000248 <+22>: bx lr +// +// You can inspect the memory directly. +// What is the current value of the cycle counter? +// +// (gdb) x 0xe0001004 +// +// [Your answer here] +// +// Now, let's execute one iteration: +// (gdb) continue +// +// What is now the current value of the cycle counter? +// +// [Your answer here] +// +// By how much does the cycle counter increase for each iteration? +// +// [Your answer here] +// +// ------------------------------------------------------------------------ +// F) Reseting the cycle counter +// You can restart the program by a "soft reset". +// (gdb) monitor reset init +// "monitor" the command to be sent to `openocd` +// "reset init" will reset the MCU and run a the "init" hook +// in the currently loaded `openocd` configuration. +// +// Now repeat the procedure from E) and check the value of +// the cycle counter. +// +// You will find that its not starting over but continuing. +// This may be a bit confusing (we like deterministic behavior). +// +// Look in the documentation for `cortex_m` +// https://docs.rs/cortex-m/0.6.4/cortex_m/peripheral/dwt/struct.RegisterBlock.html +// +// This is a "raw" API to the underlying hardware. +// Under the hood, it uses `volatile_register` +// https://docs.rs/volatile-register/0.2.0/volatile_register/struct.RW.html +// +// for which write is marked "unsafe". +// +// Figure out the way to access the register through: +// `cx. cx.core.DWT.cyccnt.write(0)`. +// And add that to the code just before calling `timed_loop`. +// +// (Remember to comment out #![deny(unsafe_code)]) +// +// Now run re-run the procedure from E) +// +// What is the initial value of the cycle counter +// (when hitting the `timed_loop` breakpoint)? +// +// [Your answer here] +// +// ------------------------------------------------------------------------ +// F) Finally some statics +// For embedded it is often/typically crucial to reduce overhead, +// - code footprint (flash memory) +// - memory footprint (sram memory) +// - execution OH (speed/power consumption) +// +// We have already looked execution overhead and can conclude +// Rust to do an excellent job. +// +// The ram requirement for the application amounts only to +// to the buffers (allocated for RTT), and the stack +// for the functions (which is in this case 0 for the `timed_loop`). +// +// Let's have a look at the code footprint (flash). +// +// > cargo bloat -n 100 --example rtt_timing --release --features nightly +// ... +// 0.0% 0.4% 24B [Unknown] timed_loop +// ... +// +// That is, the timed loop takes only 24 bytes of memory! +// +// > cargo size --example rtt_timing --release --features nightly +// Finished release [optimized + debuginfo] target(s) in 0.03s +// text data bss dec hex filename +// 6876 0 1084 7960 1f18 rtt_timing +// +// And the code overall less than 8k of flash. +// +// Your assignment now is to get this down, by identifying the +// memory hogs. You can remove the tracing, replace the panic +// handler by `panic_halt`, but besides that it should still +// measure time (debugging in gdb of `timed_loop` must still work). +// +// ------------------------------------------------------------------------ +// Summary: +// What are the learning outcomes. +// +// - Rust generates really bad code in debug build. +// - Rust generates really, really, really good code in release build. +// Zero-cost abstractions are for real. +// +// - RTIC is extremely light weight (zero-cost in release build). +// - RTIC applications are easy to -- GitLab