From 6845790ab88234386579a9d7aa571bd6ee50799c Mon Sep 17 00:00:00 2001
From: Per Lindgren <per.lindgren@ltu.se>
Date: Wed, 11 Nov 2020 22:39:09 +0100
Subject: [PATCH] rtt_timing

---
 examples/rtt_simple.rs |  43 ++++++
 examples/rtt_timing.rs | 319 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 362 insertions(+)
 create mode 100644 examples/rtt_simple.rs
 create mode 100644 examples/rtt_timing.rs

diff --git a/examples/rtt_simple.rs b/examples/rtt_simple.rs
new file mode 100644
index 0000000..c9005a4
--- /dev/null
+++ b/examples/rtt_simple.rs
@@ -0,0 +1,43 @@
+//! examples/rtt_simple.rs
+
+#![deny(unsafe_code)]
+#![deny(warnings)]
+#![no_main]
+#![no_std]
+
+use panic_rtt_target as _;
+use rtt_target::{rprintln, rtt_init_print};
+use stm32f4;
+
+#[rtic::app(device = stm32f4)]
+const APP: () = {
+    #[init]
+    fn init(_cx: init::Context) {
+        rtt_init_print!();
+        rprintln!("init");
+    }
+
+    #[idle]
+    fn idle(_cx: idle::Context) -> ! {
+        rprintln!("idle");
+        loop {
+            continue;
+        }
+    }
+};
+
+// > cargo run rtt_simple
+//    Compiling app v0.1.0 (/home/pln/courses/e7020e/app)
+// Finished dev [unoptimized + debuginfo] target(s) in 0.15s
+// Running `probe-run --chip STM32F411RETx target/thumbv7em-none-eabi/debug/examples/rtt_simple`
+// flashing program ..
+// DONE
+// resetting device
+// init
+// idle
+//
+// > [Ctrl-C]
+// stack backtrace:
+// 0: 0x0800031a - rtt_simple::idle
+// 1: 0x0800036c - main
+// 2: 0x080001da - Reset
diff --git a/examples/rtt_timing.rs b/examples/rtt_timing.rs
new file mode 100644
index 0000000..7f0c575
--- /dev/null
+++ b/examples/rtt_timing.rs
@@ -0,0 +1,319 @@
+//! examples/rtt_timing.rs
+
+// #![deny(unsafe_code)]
+#![deny(warnings)]
+#![no_main]
+#![no_std]
+
+use cortex_m::{asm, peripheral::DWT};
+use panic_rtt_target as _;
+use rtt_target::{rprintln, rtt_init_print};
+use stm32f4;
+
+#[rtic::app(device = stm32f4)]
+const APP: () = {
+    #[init]
+    fn init(mut cx: init::Context) {
+        rtt_init_print!();
+        rprintln!("init");
+
+        // Initialize (enable) the monotonic timer (CYCCNT)
+        cx.core.DCB.enable_trace();
+        cx.core.DWT.enable_cycle_counter();
+
+        rprintln!("start timed_loop");
+        let (start, end) = timed_loop();
+        rprintln!(
+            "start {}, end {}, diff {}",
+            start,
+            end,
+            end.wrapping_sub(start)
+        );
+    }
+
+    #[idle]
+    fn idle(_cx: idle::Context) -> ! {
+        rprintln!("idle");
+        loop {
+            continue;
+        }
+    }
+};
+
+// Forbid inlining and keeping name (symbol) readable.
+#[inline(never)]
+#[no_mangle]
+fn timed_loop() -> (u32, u32) {
+    let start = DWT::get_cycle_count();
+    for _ in 0..10000 {
+        asm::nop();
+    }
+    let end = DWT::get_cycle_count();
+    (start, end)
+}
+
+// We can measure execution time using the built in cycle counter,
+// a wrapping 32 bit timer.
+//
+// `DWT::get_cycle_count()` reads the current value.
+//
+// ------------------------------------------------------------------------
+// Exercises:
+//
+// A.1) What is the cycle count for the loop?
+// > cargo run --example rtt_timing
+//
+// [Your answer here]
+//
+// A.2) How many cycles per iteration?
+//
+// [Your answer here]
+//
+// ------------------------------------------------------------------------
+// Now try a release (optimized build, see `Cargo.toml` for build options).
+// B.1) What is the cycle count for the loop?
+// > cargo run --example rtt_timing --release
+//
+// [Your answer here]
+//
+// B.2) How many cycles per iteration?
+//
+// [Your answer here]
+//
+// What is the speedup (A/B)?
+//
+// [Your answer here]
+//
+// Why do you think it differs that much?
+//
+// [Your answer here]
+//
+// ------------------------------------------------------------------------
+// In the loop there is just a single assembly instruction (nop).
+// However, the API for "inline assembly" is not yet stabilized, thus
+// we have to call an "external" functions that implements the assembly.
+//
+// We can switch to the "nightly" channel (which allows experimental features).
+//
+// > rustup override set nightly
+// info: using existing install for 'nightly-x86_64-unknown-linux-gnu'
+// info: override toolchain for '/home/pln/courses/e7020e/app' set to 'nightly-x86_64-unknown-linux-gnu'
+//
+//  nightly-x86_64-unknown-linux-gnu unchanged - rustc 1.49.0-nightly (cf9cf7c92 2020-11-10)
+//
+// > rustup target add thumbv7em-none-eabi
+// (only needed first time)
+//
+// Now try a release (optimized build, see `Cargo.toml` for build options).
+// C.1) What is the cycle count for the loop?
+// > cargo run --example rtt_timing --release
+//
+// [Your answer here]
+//
+// C.2) How many cycles per iteration?
+//
+// [Your answer here]
+//
+// What is the speedup (A/C)?
+//
+// [Your answer here]
+//
+// ------------------------------------------------------------------------
+// D) Now lets have a closer look at the generated assembly.
+//
+// > cargo objdump --example rtt_timing --release  --features nightly -- --disassemble > rtt_timing.objdump
+//
+// Open the file in you editor and search for the `timed_loop`.
+//
+// [Assembly for function here]
+//
+// Locate the loop body, and verify that it makes sense
+// based on the information from the technical documentation:
+//
+// https://developer.arm.com/documentation/ddi0439/b/Programmers-Model/Instruction-set-summary/Cortex-M4-instructions
+//
+//
+// ------------------------------------------------------------------------
+// E) Now we shall take detailed control over the debugging.
+//
+// Alter the .cargo/config
+// uncomment:
+// runner = "arm-none-eabi-gdb -q -x openocd.gdb"
+// or (if under ubuntu)
+// runner = "gdb-multiarch -q -x openocd.gdb"
+//
+// comment:
+// # runner = "probe-run --chip STM32F411RETx"
+//
+// Remember to save the file!
+//
+// In a separate terminal start `openocd`.
+// > openocd -f openocd.cfg
+// And in the main terminal run:
+// > cargo run --example rtt_timing --release --features nightly
+// ...
+// halted: PC: 0x0800019a
+// 0x0800019a in cortex_m_rt::Reset () at /home/pln/.cargo/registry/src/github.com-1ecc6299db9ec823/cortex-m-rt-0.6.13/src/lib.rs:497
+// 497     pub unsafe extern "C" fn Reset() -> ! {
+//
+// Now add a breakpoint at `timed_loop`.
+// (gdb) break timed_loop
+// Breakpoint 5 at 0x800023e: file examples/rtt_timing.rs, line 45.
+//
+// Now we can continue (first break will be at `init`)
+// (gdb) continue
+// Breakpoint 4, cortex_m::interrupt::disable () at /home/pln/.cargo/registry/src/github.com-1ecc6299db9ec823/cortex-m-0.6.4/src/interrupt.rs:13
+//                  llvm_asm!("cpsid i" ::: "memory" : "volatile");
+// (`init` in RTIC runs with interrupts disabled)
+// (gdb) continue
+// Breakpoint 5, core::ptr::read_volatile<u32> (src=<optimized out>) at /home/pln/.rustup/toolchains/nightly-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/ptr/mod.rs:1052
+//        unsafe { intrinsics::volatile_load(src) }
+//
+// Now we are inside the `timed_loop`.
+// disassemble
+// Dump of assembler code for function _ZN10rtt_timing10timed_loop17h4a445e5f592f3304E:
+// 0x08000232 <+0>:     movw    r1, #4100       ; 0x1004
+// 0x08000236 <+4>:     movw    r2, #10000      ; 0x2710
+// 0x0800023a <+8>:     movt    r1, #57344      ; 0xe000
+// => 0x0800023e <+12>:    ldr     r0, [r1, #0]
+// 0x08000240 <+14>:    subs    r2, #1
+// 0x08000242 <+16>:    nop
+// 0x08000244 <+18>:    bne.n   0x8000240 <rtt_timing::timed_loop+14>
+// 0x08000246 <+20>:    ldr     r1, [r1, #0]
+// 0x08000248 <+22>:    bx      lr
+//
+// (breakpoints can be a bit "off" for release builds, here it
+// put the breakpoint not at the function entry but rather at the
+// first Rust line (the load done by `DWT::get_cycle_count()`)).
+//
+// Is there a function call to the DWT::get_cycle_count(),
+// if not can you explain what has happened here?
+// In particular, what is the content of r1?
+//
+// Confer to the documentation:
+// https://developer.arm.com/documentation/ddi0439/b/Data-Watchpoint-and-Trace-Unit/DWT-Programmers-Model
+//
+// [Your answer here]
+//
+// Now check your answer by dumping the registers
+// (gdb) info registers
+//
+// [Register dump here]
+//
+// We can now set a breakpoint exactly at the `nop`.
+//
+// (gdb) break *0x8000240
+//
+// And continue execution to the breakpoint:
+// (gdb) continue
+//
+// (gdb) disassemble
+//    0x08000232 <+0>:     movw    r1, #4100       ; 0x1004
+// 0x08000236 <+4>:     movw    r2, #10000      ; 0x2710
+// 0x0800023a <+8>:     movt    r1, #57344      ; 0xe000
+// 0x0800023e <+12>:    ldr     r0, [r1, #0]
+// 0x08000240 <+14>:    subs    r2, #1
+// => 0x08000242 <+16>:    nop
+// 0x08000244 <+18>:    bne.n   0x8000240 <rtt_timing::timed_loop+14>
+// 0x08000246 <+20>:    ldr     r1, [r1, #0]
+// 0x08000248 <+22>:    bx      lr
+//
+// You can inspect the memory directly.
+// What is the current value of the cycle counter?
+//
+// (gdb) x 0xe0001004
+//
+// [Your answer here]
+//
+// Now, let's execute one iteration:
+// (gdb) continue
+//
+// What is now the current value of the cycle counter?
+//
+// [Your answer here]
+//
+// By how much does the cycle counter increase for each iteration?
+//
+// [Your answer here]
+//
+// ------------------------------------------------------------------------
+// F) Reseting the cycle counter
+// You can restart the program by a "soft reset".
+// (gdb) monitor reset init
+// "monitor" the command to be sent to `openocd`
+// "reset init" will reset the MCU and run a the "init" hook
+// in the currently loaded `openocd` configuration.
+//
+// Now repeat the procedure from E) and check the value of
+// the cycle counter.
+//
+// You will find that its not starting over but continuing.
+// This may be a bit confusing (we like deterministic behavior).
+//
+// Look in the documentation for `cortex_m`
+// https://docs.rs/cortex-m/0.6.4/cortex_m/peripheral/dwt/struct.RegisterBlock.html
+//
+// This is a "raw" API to the underlying hardware.
+// Under the hood, it uses `volatile_register`
+// https://docs.rs/volatile-register/0.2.0/volatile_register/struct.RW.html
+//
+// for which write is marked "unsafe".
+//
+// Figure out the way to access the register through:
+// `cx. cx.core.DWT.cyccnt.write(0)`.
+// And add that to the code just before calling `timed_loop`.
+//
+// (Remember to comment out #![deny(unsafe_code)])
+//
+// Now run re-run the procedure from E)
+//
+// What is the initial value of the cycle counter
+// (when hitting the `timed_loop` breakpoint)?
+//
+// [Your answer here]
+//
+// ------------------------------------------------------------------------
+// F) Finally some statics
+// For embedded it is often/typically crucial to reduce overhead,
+// - code footprint (flash memory)
+// - memory footprint (sram memory)
+// - execution OH (speed/power consumption)
+//
+// We have already looked execution overhead and can conclude
+// Rust to do an excellent job.
+//
+// The ram requirement for the application amounts only to
+// to the buffers (allocated for RTT), and the stack
+// for the functions (which is in this case 0 for the `timed_loop`).
+//
+// Let's have a look at the code footprint (flash).
+//
+// > cargo bloat -n 100 --example rtt_timing --release --features nightly
+// ...
+// 0.0%   0.4%    24B   [Unknown] timed_loop
+// ...
+//
+// That is, the timed loop takes only 24 bytes of memory!
+//
+// > cargo size --example rtt_timing --release --features nightly
+// Finished release [optimized + debuginfo] target(s) in 0.03s
+// text    data     bss     dec     hex filename
+// 6876       0    1084    7960    1f18 rtt_timing
+//
+// And the code overall less than 8k of flash.
+//
+// Your assignment now is to get this down, by identifying the
+// memory hogs. You can remove the tracing, replace the panic
+// handler by `panic_halt`, but besides that it should still
+// measure time (debugging in gdb of `timed_loop` must still work).
+//
+// ------------------------------------------------------------------------
+// Summary:
+// What are the learning outcomes.
+//
+// - Rust generates really bad code in debug build.
+// - Rust generates really, really, really good code in release build.
+//   Zero-cost abstractions are for real.
+//
+// - RTIC is extremely light weight (zero-cost in release build).
+// - RTIC applications are easy to
-- 
GitLab