//! examples/rtt_timing.rs

#![deny(unsafe_code)]
#![deny(warnings)]
#![no_main]
#![no_std]

use cortex_m::{asm, peripheral::DWT};
use panic_rtt_target as _;
use rtt_target::{rprintln, rtt_init_print};
use stm32f4;

#[rtic::app(device = stm32f4)]
const APP: () = {
    #[init]
    fn init(mut cx: init::Context) {
        rtt_init_print!();
        rprintln!("init");

        // Initialize (enable) the monotonic timer (CYCCNT)
        cx.core.DCB.enable_trace();
        cx.core.DWT.enable_cycle_counter();

        rprintln!("start timed_loop");
        let (start, end) = timed_loop();
        rprintln!(
            "start {}, end {}, diff {}",
            start,
            end,
            end.wrapping_sub(start)
        );
    }

    #[idle]
    fn idle(_cx: idle::Context) -> ! {
        rprintln!("idle");
        loop {
            continue;
        }
    }
};

// Forbid inlining and keeping name (symbol) readable.
#[inline(never)]
#[no_mangle]
fn timed_loop() -> (u32, u32) {
    let start = DWT::get_cycle_count();
    for _ in 0..10000 {
        asm::nop();
    }
    let end = DWT::get_cycle_count();
    (start, end)
}

// We can measure execution time using the built in cycle counter,
// a wrapping 32 bit timer.
//
// `DWT::get_cycle_count()` reads the current value.
//
// ------------------------------------------------------------------------
// Exercises:
//
// A.1) What is the cycle count for the loop?
// > cargo run --example rtt_timing
//
// [Your answer here]
//
// A.2) How many cycles per iteration?
//
// [Your answer here]
//
// A.3) Why do we need a wrapping subtraction?
//
// [Your answer here]
//
// ------------------------------------------------------------------------
// Now try a release (optimized build, see `Cargo.toml` for build options).
// B.1) What is the cycle count for the loop?
// > cargo run --example rtt_timing --release
//
// [Your answer here]
//
// B.2) How many cycles per iteration?
//
// [Your answer here]
//
// What is the speedup (A/B)?
//
// [Your answer here]
//
// Why do you think it differs that much?
//
// [Your answer here]
//
// ------------------------------------------------------------------------
// In the loop there is just a single assembly instruction (nop).
// However, the API for "inline assembly" is not yet stabilized, thus
// we have to call an "external" functions that implements the assembly.
//
// We can switch to the "nightly" channel (which allows experimental features).
//
// > rustup override set nightly
// info: using existing install for 'nightly-x86_64-unknown-linux-gnu'
// info: override toolchain for '/home/pln/courses/e7020e/app' set to 'nightly-x86_64-unknown-linux-gnu'
//
// nightly-x86_64-unknown-linux-gnu unchanged - rustc 1.49.0-nightly (cf9cf7c92 2020-11-10)
//
// > rustup target add thumbv7em-none-eabi
// (only needed first time)
//
// Now try a release (optimized build, see `Cargo.toml` for build options).
// C.1) What is the cycle count for the loop?
// > cargo run --example rtt_timing --release --features nightly
//
// [Your answer here]
//
// C.2) How many cycles per iteration?
//
// [Your answer here]
//
// What is the speedup (A/C)?
//
// [Your answer here]
//
// ------------------------------------------------------------------------
// D) Now lets have a closer look at the generated assembly.
//
// > cargo objdump --example rtt_timing --release  --features nightly -- --disassemble --no-show-raw-insn > rtt_timing.objdump
//
// Open the file in you editor and search for the `timed_loop`.
//
// [Assembly for function `timed_loop` here]
//
// Locate the loop body, and verify that it makes sense
// based on the information from the technical documentation:
//
// https://developer.arm.com/documentation/ddi0439/b/Programmers-Model/Instruction-set-summary/Cortex-M4-instructions
//
//
// ------------------------------------------------------------------------
// E) Now we shall take detailed control over the debugging.
//
// Alter the .cargo/config
// uncomment:
// runner = "arm-none-eabi-gdb -q -x openocd.gdb"
// or (if under ubuntu)
// runner = "gdb-multiarch -q -x openocd.gdb"
//
// comment:
// # runner = "probe-run --chip STM32F411RETx"
//
// Remember to save the file!
//
// In a separate terminal start `openocd`.
// > openocd -f openocd.cfg
// And in the main terminal run:
// > cargo run --example rtt_timing --release --features nightly
// ...
// halted: PC: 0x0800019a
// 0x0800019a in cortex_m_rt::Reset () at /home/pln/.cargo/registry/src/github.com-1ecc6299db9ec823/cortex-m-rt-0.6.13/src/lib.rs:497
// 497     pub unsafe extern "C" fn Reset() -> ! {
//
// Now add a breakpoint at `timed_loop`.
// (gdb) break timed_loop
// Breakpoint 5 at 0x800023e: file examples/rtt_timing.rs, line 46.
//
// Now we can continue (first break will be at `init`)
// (gdb) continue
// Breakpoint 4, cortex_m::interrupt::disable () at /home/pln/.cargo/registry/src/github.com-1ecc6299db9ec823/cortex-m-0.6.4/src/interrupt.rs:13
//                  llvm_asm!("cpsid i" ::: "memory" : "volatile");
// (`init` in RTIC runs with interrupts disabled)
// (gdb) continue
// Breakpoint 5, core::ptr::read_volatile<u32> (src=<optimized out>) at /home/pln/.rustup/toolchains/nightly-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/ptr/mod.rs:1052
//        unsafe { intrinsics::volatile_load(src) }
//
// Now we are inside the `timed_loop`.
// (gdb) disassemble
// Dump of assembler code for function _ZN10rtt_timing10timed_loop17h4a445e5f592f3304E:
// 0x08000232 <+0>:     movw    r1, #4100       ; 0x1004
// 0x08000236 <+4>:     movw    r2, #10000      ; 0x2710
// 0x0800023a <+8>:     movt    r1, #57344      ; 0xe000
// => 0x0800023e <+12>:    ldr     r0, [r1, #0]
// 0x08000240 <+14>:    subs    r2, #1
// 0x08000242 <+16>:    nop
// 0x08000244 <+18>:    bne.n   0x8000240 <rtt_timing::timed_loop+14>
// 0x08000246 <+20>:    ldr     r1, [r1, #0]
// 0x08000248 <+22>:    bx      lr
//
// (breakpoints can be a bit "off" for release builds, here it
// put the breakpoint not at the function entry but rather at the
// first Rust line (the load done by `DWT::get_cycle_count()`)).
//
// Is there a function call to the DWT::get_cycle_count(),
// if not can you explain what has happened here?
// In particular, what is the content of r1?
//
// Confer to the documentation:
// https://developer.arm.com/documentation/ddi0439/b/Data-Watchpoint-and-Trace-Unit/DWT-Programmers-Model
//
// [Your answer here]
//
// Now check your answer by dumping the registers
// (gdb) info registers
//
// [Register dump here]
//
// We can now set a breakpoint exactly at the `nop`.
//
// (gdb) break *0x8000242
//
// And continue execution to the breakpoint:
// (gdb) continue
//
// (gdb) disassemble
//    0x08000232 <+0>:     movw    r1, #4100       ; 0x1004
// 0x08000236 <+4>:     movw    r2, #10000      ; 0x2710
// 0x0800023a <+8>:     movt    r1, #57344      ; 0xe000
// 0x0800023e <+12>:    ldr     r0, [r1, #0]
// 0x08000240 <+14>:    subs    r2, #1
// => 0x08000242 <+16>:    nop
// 0x08000244 <+18>:    bne.n   0x8000240 <rtt_timing::timed_loop+14>
// 0x08000246 <+20>:    ldr     r1, [r1, #0]
// 0x08000248 <+22>:    bx      lr
//
// You can inspect the memory directly.
// What is the current value of the cycle counter?
//
// (gdb) x 0xe0001004
//
// [Your answer here]
//
// Now, let's execute one iteration:
// (gdb) continue
//
// What is now the current value of the cycle counter?
//
// [Your answer here]
//
// By how much does the cycle counter increase for each iteration?
//
// [Your answer here]
//
// ------------------------------------------------------------------------
// F) Reseting the cycle counter
// You can restart the program by a "soft reset".
// (gdb) monitor reset init
// "monitor" the command to be sent to `openocd`
// "reset init" will reset the MCU and run a the "init" hook
// in the currently loaded `openocd` configuration.
//
// Now repeat the procedure from E) and check the value of
// the cycle counter.
//
// You will find that its not starting over but continuing.
// This may be a bit confusing (we like deterministic behavior).
//
// Look in the documentation for `cortex_m`
// https://docs.rs/cortex-m/0.6.4/cortex_m/peripheral/dwt/struct.RegisterBlock.html
//
// This is a "raw" API to the underlying hardware.
// Under the hood, it uses `volatile_register`
// https://docs.rs/volatile-register/0.2.0/volatile_register/struct.RW.html
//
// for which write is marked "unsafe".
//
// Figure out the way to access the register through:
// `cx.core.DWT.cyccnt.write(0)`.
// And add that to the code just before calling `timed_loop`.
//
// (Remember to comment out #![deny(unsafe_code)])
//
// Now run re-run the procedure from E)
//
// What is the initial value of the cycle counter
// (when hitting the `timed_loop` breakpoint)?
//
// [Your answer here]
//
// ------------------------------------------------------------------------
// F) Finally some statics
// For embedded it is often/typically crucial to reduce overhead,
// - code footprint (flash memory)
// - memory footprint (sram memory)
// - execution OH (speed/power consumption)
//
// We have already looked execution overhead and can conclude
// Rust to do an excellent job.
//
// The ram requirement for the application amounts only to
// to the buffers (allocated for RTT), and the stack
// for the functions (which is in this case 0 for the `timed_loop`).
// (Look at the code, there is no pushing/popping, no local stack.)
//
// Let's have a look at the code footprint (flash).
//
// > cargo bloat -n 100 --example rtt_timing --release --features nightly
// ...
// 0.0%   0.4%    24B   [Unknown] timed_loop
// ...
//
// That is, the timed loop takes only 24 bytes of memory!
//
// > cargo size --example rtt_timing --release --features nightly
// Finished release [optimized + debuginfo] target(s) in 0.03s
// text    data     bss     dec     hex filename
// 6876       0    1084    7960    1f18 rtt_timing
//
// And the code overall less than 8k of flash.
//
// Your assignment now is to get this down, by identifying the
// memory hogs. You can remove the tracing, replace the panic
// handler by `panic_halt`, but besides that it should still
// measure time (debugging in gdb of `timed_loop` must still work).
//
// > cargo size --example rtt_timing --release --features nightly
//
// [Your answer here]
//
// I was able to get down to:
// > cargo size --example rtt_timing --release --features nightly
// Compiling app v0.1.0 (/home/pln/courses/e7020e/app)
// Finished release [optimized + debuginfo] target(s) in 0.32s
// text    data     bss     dec     hex filename
// 660       0       0     660     294 rtt_timing
// Try beat that...
// ------------------------------------------------------------------------
// Summary:
// What are the learning outcomes.
//
// - You have confirmed that RTIC is extremely light weight
//   (zero-cost in release build).
//   Applications can be be less than 1k flash. Ideal for IoT!
//
// - You have seen that RTIC applications are easy to trace using RTT
//   If more details are required, you have learned to use gdb.
//
// - You have confirmed that Rust generates:
//   really, really, bad code in debug build (beware!).
//   really, really, really good code in release build!
//
// - You have setup timing measurements which are
//   Cycle accurate (0.000 000 01s at 100MHz).
//   Consistent (down to a single clock cycle).
//   Predictable (you got what you expected right?)