diff options
author | Adolfo Ochagavía <github@adolfo.ochagavia.nl> | 2023-09-05 11:25:36 +0200 |
---|---|---|
committer | ctz <jpixton@gmail.com> | 2023-09-05 13:34:27 +0000 |
commit | 8c914559f8b31c2db642322a8c87b95ccfd9b2f3 (patch) | |
tree | abb76f1b91579ccb4709a152e80b6faf6de0b349 /ci-bench | |
parent | c9a397446231854c5af439081875ef53fe4339c8 (diff) |
Show detailed icount diff for scenarios with noteworthy diffs
Diffstat (limited to 'ci-bench')
-rw-r--r-- | ci-bench/README.md | 15 | ||||
-rw-r--r-- | ci-bench/src/cachegrind.rs | 99 | ||||
-rw-r--r-- | ci-bench/src/main.rs | 96 |
3 files changed, 167 insertions, 43 deletions
diff --git a/ci-bench/README.md b/ci-bench/README.md index 3187adac..d869c484 100644 --- a/ci-bench/README.md +++ b/ci-bench/README.md @@ -12,8 +12,9 @@ important bits. _Note: this step requires having `valgrind` in your path._ -Use `cargo run --release -- run-all > out.csv` to generate a CSV with the instruction counts for -the different scenarios we support. The result should look like the following: +Use `cargo run --release -- run-all --output-dir foo` to generate the results inside the `foo` +directory. Within that directory, you will find an `icounts.csv` file with the instruction counts +for the different scenarios we support. It should look like the following: ```csv handshake_no_resume_1.2_rsa_aes_server,11327015 @@ -31,12 +32,14 @@ handshake_no_resume_1.3_rsa_aes_client,4212770 ... ``` +In the `cachegrind` subdirectory you will find output files emitted by the `cachegrind` tool, which +are useful to report detailed instruction count differences when comparing two benchmark runs. + ### Comparing results -Use `cargo run --release -- compare out1.csv out2.csv`. It will output a report using -GitHub-flavored markdown (used by the CI itself to give feedback about PRs). We currently -consider differences of 0.2% to be significant, but might tweak it in the future after we gain -experience with the benchmarking setup. +Use `cargo run --release -- compare foo bar`. It will output a report using GitHub-flavored markdown +(used by the CI itself to give feedback about PRs). We currently consider differences of 0.2% to be +significant, but might tweak it in the future after we gain experience with the benchmarking setup. 
### Supported scenarios diff --git a/ci-bench/src/cachegrind.rs b/ci-bench/src/cachegrind.rs index e674fbf0..064aeeb1 100644 --- a/ci-bench/src/cachegrind.rs +++ b/ci-bench/src/cachegrind.rs @@ -9,12 +9,17 @@ use anyhow::Context; use crate::benchmark::Benchmark; use crate::Side; +/// The subdirectory in which the cachegrind output should be stored +const CACHEGRIND_OUTPUT_SUBDIR: &str = "cachegrind"; + /// A cachegrind-based benchmark runner pub struct CachegrindRunner { /// The path to the ci-bench executable /// /// This is necessary because the cachegrind runner works by spawning child processes executable: String, + /// The directory where the cachegrind output will be stored + output_dir: PathBuf, /// The amount of instructions that are executed upon startup of the child process, before /// actually running one of the benchmarks /// @@ -24,9 +29,13 @@ pub struct CachegrindRunner { impl CachegrindRunner { /// Returns a new cachegrind-based benchmark runner - pub fn new(executable: String) -> anyhow::Result<Self> { + pub fn new(executable: String, output_dir: PathBuf) -> anyhow::Result<Self> { Self::ensure_cachegrind_available()?; + let cachegrind_output_dir = output_dir.join(CACHEGRIND_OUTPUT_SUBDIR); + std::fs::create_dir_all(&cachegrind_output_dir) + .context("Failed to create cachegrind output directory")?; + // We don't care about the side here, so let's use `Server` just to choose something let overhead_instructions = Self::run_bench_side( &executable, @@ -35,12 +44,14 @@ impl CachegrindRunner { "calibration", Stdio::piped(), Stdio::piped(), + &cachegrind_output_dir, )? 
.wait_and_get_instr_count() .context("Unable to count overhead instructions")?; Ok(CachegrindRunner { executable, + output_dir: cachegrind_output_dir, overhead_instructions, }) } @@ -61,6 +72,7 @@ impl CachegrindRunner { &bench.name_with_side(Side::Server), Stdio::piped(), Stdio::piped(), + &self.output_dir, ) .context("server side bench crashed")?; @@ -71,6 +83,7 @@ impl CachegrindRunner { &bench.name_with_side(Side::Client), Stdio::from(server.process.stdout.take().unwrap()), Stdio::from(server.process.stdin.take().unwrap()), + &self.output_dir, ) .context("client side bench crashed")?; @@ -116,10 +129,9 @@ impl CachegrindRunner { name: &str, stdin: Stdio, stdout: Stdio, + output_dir: &Path, ) -> anyhow::Result<BenchSubprocess> { - let output_file = PathBuf::from(format!("target/cachegrind/cachegrind.out.{}", name)); - std::fs::create_dir_all(output_file.parent().unwrap()) - .context("Failed to create cachegrind output directory")?; + let cachegrind_output_file = output_dir.join(name); // Run under setarch to disable ASLR, to reduce noise let mut cmd = Command::new("setarch"); @@ -133,7 +145,10 @@ impl CachegrindRunner { // keep stderr free of noise, to see any errors from the child process) .arg("--log-file=/dev/null") // The file where the instruction counts will be stored - .arg(format!("--cachegrind-out-file={}", output_file.display())) + .arg(format!( + "--cachegrind-out-file={}", + cachegrind_output_file.display() + )) .arg(executable) .arg("run-single") .arg(benchmark_index.to_string()) @@ -146,7 +161,7 @@ impl CachegrindRunner { Ok(BenchSubprocess { process: child, - output_file, + cachegrind_output_file, }) } } @@ -156,7 +171,7 @@ struct BenchSubprocess { /// The benchmark's child process, running under cachegrind process: Child, /// Cachegrind's output file for this benchmark - output_file: PathBuf, + cachegrind_output_file: PathBuf, } impl BenchSubprocess { @@ -173,9 +188,7 @@ impl BenchSubprocess { ); } - let instruction_count = 
parse_cachegrind_output(&self.output_file)?; - std::fs::remove_file(&self.output_file).ok(); - + let instruction_count = parse_cachegrind_output(&self.cachegrind_output_file)?; Ok(instruction_count) } } @@ -216,3 +229,69 @@ impl Sub for InstructionCounts { } } } + +/// Returns the detailed instruction diff between the baseline and the candidate +pub fn diff(baseline: &Path, candidate: &Path, scenario: &str) -> anyhow::Result<String> { + // The latest version of valgrind has deprecated cg_diff, which has been superseded by + // cg_annotate. Many systems are running older versions, though, so we are sticking with cg_diff + // for the time being. + + let tmp_path = Path::new("target/ci-bench-tmp"); + let tmp = File::create(tmp_path).context("cannot create temp file for cg_diff")?; + + // cg_diff generates a diff between two cachegrind output files in a custom format that is not + // user-friendly + let cg_diff = Command::new("cg_diff") + .arg( + baseline + .join(CACHEGRIND_OUTPUT_SUBDIR) + .join(scenario), + ) + .arg( + candidate + .join(CACHEGRIND_OUTPUT_SUBDIR) + .join(scenario), + ) + .stdout(Stdio::from(tmp)) + .spawn() + .context("cannot spawn cg_diff subprocess")? 
+ .wait() + .context("error waiting for cg_diff to finish")?; + + if !cg_diff.success() { + anyhow::bail!( + "cg_diff finished with an error (code = {:?})", + cg_diff.code() + ) + } + + // cg_annotate transforms the output of cg_diff into something a user can understand + let cg_annotate = Command::new("cg_annotate") + .arg(tmp_path) + .arg("--auto=no") + .output() + .context("error waiting for cg_annotate to finish")?; + + if !cg_annotate.status.success() { + anyhow::bail!( + "cg_annotate finished with an error (code = {:?})", + cg_annotate.status.code() + ) + } + + let annotated = + String::from_utf8(cg_annotate.stdout).context("cg_annotate produced invalid UTF8")?; + + // Discard lines before the first `Ir` header + let mut diff = String::new(); + for line in annotated + .trim() + .lines() + .skip_while(|l| l.trim() != "Ir") + { + diff.push_str(line); + diff.push('\n'); + } + + Ok(diff) +} diff --git a/ci-bench/src/main.rs b/ci-bench/src/main.rs index 066b7187..5b0e8dad 100644 --- a/ci-bench/src/main.rs +++ b/ci-bench/src/main.rs @@ -56,6 +56,9 @@ const RESUMED_HANDSHAKE_RUNS: usize = 30; /// The threshold at which instruction count changes are considered relevant const CHANGE_THRESHOLD: f64 = 0.002; // 0.2% +/// The name of the file where the instruction counts are stored after a `run-all` run +const ICOUNTS_FILENAME: &str = "icounts.csv"; + #[derive(Parser)] #[command(about)] pub struct Cli { @@ -66,15 +69,18 @@ pub struct Cli { #[derive(Subcommand)] pub enum Command { /// Run all benchmarks and prints the measured CPU instruction counts in CSV format - RunAll, + RunAll { + #[arg(short, long)] + output_dir: Option<PathBuf>, + }, /// Run a single benchmark at the provided index (used by the bench runner to start each benchmark in its own process) RunSingle { index: u32, side: Side }, /// Compare the results from two previous benchmark runs and print a user-friendly markdown overview Compare { - /// Path to a CSV file obtained from a previous `run-all` execution 
- baseline_input: PathBuf, - /// Path to a CSV file obtained from a previous `run-all` execution - candidate_input: PathBuf, + /// Path to the directory with the results of a previous `run-all` execution + baseline_dir: PathBuf, + /// Path to the directory with the results of a previous `run-all` execution + candidate_dir: PathBuf, }, } @@ -99,13 +105,16 @@ fn main() -> anyhow::Result<()> { let cli = Cli::parse(); match cli.command { - Command::RunAll => { + Command::RunAll { output_dir } => { let executable = std::env::args().next().unwrap(); - let results = run_all(executable, &benchmarks)?; + let output_dir = output_dir.unwrap_or("target/ci-bench".into()); + let results = run_all(executable, output_dir.clone(), &benchmarks)?; // Output results in CSV (note: not using a library here to avoid extra dependencies) + let mut csv_file = File::create(output_dir.join(ICOUNTS_FILENAME)) + .context("cannot create output csv file")?; for (name, instr_count) in results { - println!("{name},{instr_count}"); + writeln!(csv_file, "{name},{instr_count}")?; } } Command::RunSingle { index, side } => { @@ -167,12 +176,12 @@ fn main() -> anyhow::Result<()> { mem::forget(stdout); } Command::Compare { - baseline_input, - candidate_input, + baseline_dir, + candidate_dir, } => { - let baseline = read_results(baseline_input.as_ref())?; - let candidate = read_results(candidate_input.as_ref())?; - let result = compare_results(&baseline, &candidate); + let baseline = read_results(&baseline_dir.join(ICOUNTS_FILENAME))?; + let candidate = read_results(&candidate_dir.join(ICOUNTS_FILENAME))?; + let result = compare_results(&baseline_dir, &candidate_dir, &baseline, &candidate)?; print_report(&result); if !result.noteworthy.is_empty() { @@ -275,9 +284,13 @@ fn add_benchmark_group(benchmarks: &mut Vec<Benchmark>, params: BenchmarkParams) } /// Run all the provided benches under cachegrind to retrieve their instruction count -pub fn run_all(executable: String, benches: &[Benchmark]) -> 
anyhow::Result<Vec<(String, u64)>> { +pub fn run_all( + executable: String, + output_dir: PathBuf, + benches: &[Benchmark], +) -> anyhow::Result<Vec<(String, u64)>> { // Run the benchmarks in parallel - let cachegrind = CachegrindRunner::new(executable)?; + let cachegrind = CachegrindRunner::new(executable, output_dir)?; let results: Vec<_> = benches .par_iter() .enumerate() @@ -497,8 +510,10 @@ fn run_bench<T: BenchStepper>(mut stepper: T, kind: BenchmarkKind) -> anyhow::Re /// The results of a comparison between two `run-all` executions struct CompareResult { - /// Results that probably indicate a real change in performance and should be highlighted - noteworthy: Vec<Diff>, + /// Results that probably indicate a real change in performance and should be highlighted. + /// + /// The string is a detailed diff between the instruction counts obtained from cachegrind. + noteworthy: Vec<(Diff, String)>, /// Results within the noise threshold negligible: Vec<Diff>, /// Benchmark scenarios present in the candidate but missing in the baseline @@ -517,7 +532,10 @@ struct Diff { /// Reads the (benchmark, instruction count) pairs from previous CSV output fn read_results(path: &Path) -> anyhow::Result<HashMap<String, u64>> { - let file = fs::File::open(path).context("CSV file for comparison not found")?; + let file = File::open(path).context(format!( + "CSV file for comparison not found: {}", + path.display() + ))?; let mut measurements = HashMap::new(); for line in BufReader::new(file).lines() { @@ -543,9 +561,11 @@ fn read_results(path: &Path) -> anyhow::Result<HashMap<String, u64>> { /// Returns an internal representation of the comparison between the baseline and the candidate /// measurements fn compare_results( + baseline_dir: &Path, + candidate_dir: &Path, baseline: &HashMap<String, u64>, candidate: &HashMap<String, u64>, -) -> CompareResult { +) -> anyhow::Result<CompareResult> { let mut diffs = Vec::new(); let mut missing = Vec::new(); for (scenario, &instr_count) in 
candidate { @@ -573,11 +593,18 @@ fn compare_results( }); let (noteworthy, negligible) = split_on_threshold(&diffs); - CompareResult { - noteworthy, + + let mut noteworthy_with_details = Vec::new(); + for diff in noteworthy { + let detailed_diff = cachegrind::diff(baseline_dir, candidate_dir, &diff.scenario)?; + noteworthy_with_details.push((diff, detailed_diff)); + } + + Ok(CompareResult { + noteworthy: noteworthy_with_details, negligible, missing_in_baseline: missing, - } + }) } /// Prints a report of the comparison to stdout, using GitHub-flavored markdown @@ -594,23 +621,38 @@ fn print_report(result: &CompareResult) { } } - println!("### Noteworthy instruction count differences"); + println!("## Noteworthy instruction count differences"); if result.noteworthy.is_empty() { println!( "_There are no noteworthy instruction count differences (i.e. above {}%)_", CHANGE_THRESHOLD * 100.0 ); } else { - table(&result.noteworthy, true); + table( + result + .noteworthy + .iter() + .map(|(diff, _)| diff), + true, + ); + println!("<details>"); + println!("<summary>Details per scenario</summary>\n"); + for (diff, detailed_diff) in &result.noteworthy { + println!("#### {}", diff.scenario); + println!("```"); + println!("{detailed_diff}"); + println!("```"); + } + println!("</details>\n") } - println!("### Other instruction count differences"); + println!("## Other instruction count differences"); if result.negligible.is_empty() { println!("_There are no other instruction count differences_"); } else { println!("<details>"); println!("<summary>Click to expand</summary>\n"); - table(&result.negligible, false); + table(result.negligible.iter(), false); println!("</details>\n") } } @@ -633,7 +675,7 @@ fn split_on_threshold(diffs: &[Diff]) -> (Vec<Diff>, Vec<Diff>) { } /// Renders the diffs as a markdown table -fn table(diffs: &[Diff], emoji_feedback: bool) { +fn table<'a>(diffs: impl Iterator<Item = &'a Diff>, emoji_feedback: bool) { println!("| Scenario | Baseline | Candidate | 
Diff |"); println!("| --- | ---: | ---: | ---: |"); for diff in diffs { |