evobench_tools/stats_tables/stats/
mod.rs

1//! Type safe descriptive statistics
2//!
3//! Count, average, standard deviation, median and percentiles, with
4//! the number unit (`ViewType`) and tiles count verified in the type
5//! system, and ability to handle weighted values.
6
7pub mod average;
8pub mod weighted;
9
10use std::marker::PhantomData;
11use std::{borrow::Cow, str::FromStr};
12
13use anyhow::bail;
14use fixed::FixedU128;
15use fixed::traits::LossyFrom;
16use fixed::types::extra::U32;
17use num_traits::Zero;
18
19use super::{
20    stats::{
21        average::Average,
22        weighted::{IndexedNumbers, WeightedValue},
23    },
24    tables::table_view::{ColumnFormatting, Highlight, TableViewRow, Unit},
25};
26use crate::times::{MicroTime, NanoTime, ToStringMilliseconds};
27
28fn f64_from_fixed(f: FixedU128<U32>) -> f64 {
29    f.to_num()
30}
31
32fn u64_from_fixed(f: FixedU128<U32>) -> Option<u64> {
33    let x: FixedU128<U32> = f.round();
34    let n = u128::lossy_from(x);
35    n.try_into().ok()
36}
37
38fn absdiff(a: FixedU128<U32>, b: FixedU128<U32>) -> FixedU128<U32> {
39    if a > b { a - b } else { b - a }
40}
41
42fn square(x: FixedU128<U32>) -> FixedU128<U32> {
43    x * x
44}
45
46/// Selects a field of `Stats`, e.g. to calculate the stats for one of
47/// the Stats fields.
48#[derive(Debug, PartialEq, Clone, Copy)]
49pub enum StatsField<const TILE_COUNT: usize> {
50    N,
51    Sum,
52    Average,
53    Median,
54    SD,
55    Tile(u32),
56}
57
58impl<const TILE_COUNT: usize> FromStr for StatsField<TILE_COUNT> {
59    type Err = anyhow::Error;
60
61    fn from_str(s: &str) -> Result<Self, Self::Err> {
62        use StatsField::*;
63        match s {
64            "n" | "N" => Ok(N),
65            "sum" | "Sum" | "total" | "Total" => Ok(Sum),
66            "average" | "Average" | "avg" | "mean" => Ok(Average),
67            "median" | "Median" => Ok(Median),
68            "sd" | "SD" | "stdev" => Ok(SD),
69            _ => match f64::from_str(s) {
70                Ok(x) => {
71                    if x >= 0. && x <= 1.00000000001 {
72                        let i = (TILE_COUNT as f64 * x + 0.5).floor() as u32;
73                        Ok(Tile(i))
74                    } else {
75                        bail!(
76                            "expecting one of n|sum|average|median|sd or a floating \
77                             point number between 0 and 1, got: {x}"
78                        )
79                    }
80                }
81                Err(e) => bail!(
82                    "expecting one of n|sum|average|median|sd or a floating \
83                     point number between 0 and 1, floating point parse error for {s:?}: {e:#}"
84                ),
85            },
86        }
87    }
88}
89
90/// Convert a value to a string in a unit and corresponding
91/// representation suitable for the stats here.
92pub trait ToStatsString {
93    const UNIT_SHORT: &str;
94
95    fn to_stats_string(&self) -> String;
96}
97
98impl ToStatsString for MicroTime {
99    const UNIT_SHORT: &str = "ms";
100
101    fn to_stats_string(&self) -> String {
102        self.to_string_ms()
103    }
104}
105
106impl ToStatsString for NanoTime {
107    const UNIT_SHORT: &str = "ms";
108
109    fn to_stats_string(&self) -> String {
110        self.to_string_ms()
111    }
112}
113
114// XX define a Count ? Count<const &str>?
115impl ToStatsString for u64 {
116    const UNIT_SHORT: &str = "count";
117
118    fn to_stats_string(&self) -> String {
119        self.to_string()
120    }
121}
122
123/// `ViewType` is perhaps a bit of a misnomer: simply the type that
124/// the statistics is made for, e.g. `NanoTime`. But that type must,
125/// for full functionality, support conversion to and from u64, and
126/// `ToStatsString` for viewing.
127#[derive(Debug)]
128pub struct Stats<ViewType, const TILE_COUNT: usize> {
129    view_type: PhantomData<fn() -> ViewType>,
130    pub num_values: usize,
131    pub sum: u128,
132    pub average: FixedU128<U32>,
133    /// Interpolated and rounded up for even numbers of input values.
134    pub median: u64,
135    /// mean squared difference from the mean
136    pub variance: FixedU128<U32>,
137    /// Percentiles or in `TILE_COUNT` number of sections. Sample
138    /// count is the index, the sample value there is the value in the
139    /// vector. `tiles[0]` is the mininum, `tiles[TILE_COUNT]` the
140    /// maximum sample value. Cut-offs are inclusive, i.e. the index
141    /// is rounded up (shows the value of the next item if it falls at
142    /// least the distance 0.5 between items).
143    pub tiles: Vec<u64>,
144}
145
146#[derive(thiserror::Error, Debug)]
147pub enum StatsError {
148    #[error("no inputs given")]
149    NoInputs,
150    #[error("u128 saturated -- u64::MAX values summing up to near u128::MAX")]
151    SaturatedU128,
152    #[error("the virtual count (a u64) does not fit the usize range on this machine")]
153    VirtualCountDoesNotFitUSize,
154    #[error("the virtual sum does not fit the U96 range")]
155    VirtualSumDoesNotFitU96,
156}
157
158impl<ViewType, const TILE_COUNT: usize> Stats<ViewType, TILE_COUNT> {
159    /// Rounded
160    pub fn average_u64(&self) -> u64 {
161        u64_from_fixed(self.average).expect("average always fits its original number range")
162    }
163
164    pub fn variance_f64(&self) -> f64 {
165        f64_from_fixed(self.variance)
166    }
167
168    /// sqrt(variance) as u64, since our conversions to ms etc. are on
169    /// that type; bummer to lose f64 precision, though. Rounded.
170    pub fn standard_deviation_u64(&self) -> u64 {
171        // What about number overflows from f64 ? Can't happen,
172        // though, right?
173        (self.variance_f64().sqrt() + 0.5) as u64
174    }
175
176    /// Get the value for the given field (and ending up untyped! But
177    /// from_values was always untyped, too.)
178    #[inline]
179    pub fn get(&self, field: StatsField<TILE_COUNT>) -> u64 {
180        match field {
181            StatsField::N => self
182                .num_values
183                .try_into()
184                .expect("hopefully in range -- realistically it should be"),
185            StatsField::Sum => self
186                .sum
187                .try_into()
188                .expect("hopefully in range -- realistically it should be"),
189            StatsField::Average => self.average_u64(),
190            StatsField::Median => self.median,
191            StatsField::SD => self.standard_deviation_u64(),
192            StatsField::Tile(i) => {
193                self.tiles[usize::try_from(i).expect("u32 always works as index")]
194            }
195        }
196    }
197
198    // XXX add tests: compare to table_view_header's .0 field
199    pub fn column_of_field(field: StatsField<TILE_COUNT>) -> usize {
200        match field {
201            StatsField::N => 0,
202            StatsField::Sum => 1,
203            StatsField::Average => 2,
204            StatsField::Median => 3,
205            StatsField::SD => 4,
206            StatsField::Tile(i) => 6 + usize::try_from(i).expect("u32 always works as index"),
207        }
208    }
209
210    // If it's not count (u64), then it's ViewType. XXX add tests:
211    // compare to table_view_header's Unit field
212    pub fn field_type_is_count(field: StatsField<TILE_COUNT>) -> bool {
213        match field {
214            StatsField::N => true,
215            StatsField::Sum => false,
216            StatsField::Average => false,
217            StatsField::Median => false,
218            StatsField::SD => false,
219            StatsField::Tile(_) => false,
220        }
221    }
222
223    /// Make stats from values from field `field`: this determines the
224    /// ViewType of the resulting `Stats` struct: count or ViewType.
225    pub fn from_values_from_field(
226        field: StatsField<TILE_COUNT>,
227        vals: Vec<WeightedValue>,
228    ) -> Result<SubStats<ViewType, TILE_COUNT>, StatsError> {
229        if Self::field_type_is_count(field) {
230            Ok(SubStats::Count(Stats::from_values(vals)?))
231        } else {
232            Ok(SubStats::ViewType(Stats::from_values(vals)?))
233        }
234    }
235
236    /// `tiles_count` is how many 'tiles' to build, for percentiles
237    /// give the number 101. (Needs to own `vals` for sorting,
238    /// internally.)
239    pub fn from_values(mut vals: Vec<WeightedValue>) -> Result<Self, StatsError> {
240        {
241            let num_weights = vals.len();
242            if num_weights.is_zero() {
243                return Err(StatsError::NoInputs);
244            }
245        }
246
247        let (virtual_count, virtual_sum): (u64, u128) =
248            vals.iter()
249                .fold((0, 0), |(count, sum), WeightedValue { value, weight }| {
250                    let weight = u64::from(u32::from(*weight));
251                    (
252                        count + weight,
253                        sum + u128::from(*value) * u128::from(weight),
254                    )
255                });
256
257        let virtual_count_fixed = FixedU128::<U32>::from_num(virtual_count);
258
259        // Using fixed since the average can lie between steps; U32
260        // seems to be a good position (only guaranteed to work up to
261        // u32::MAX numbers, but practically some more, and allows to
262        // reflect averages in steps nearly up to the same numbers of
263        // numbers)
264        let average: FixedU128<U32> = {
265            // Disappointing that `fixed` does not appear to support a
266            // change in fractional bits during divisions? Maybe the
267            // still-not released version would do it?
268            let virtual_sum = FixedU128::<U32>::checked_from_num(virtual_sum)
269                .ok_or_else(|| StatsError::VirtualSumDoesNotFitU96)?;
270
271            virtual_sum / virtual_count_fixed
272        };
273
274        let variance = {
275            let sum_squared_error: FixedU128<U32> = vals
276                .iter()
277                .map(|WeightedValue { value, weight }| {
278                    // Do the same as if `value` would be copied `weight` times
279                    let value = FixedU128::<U32>::from_num(*value);
280                    let weight = FixedU128::<U32>::from_num(u32::from(*weight));
281
282                    square(absdiff(value, average)) * weight
283                })
284                .sum();
285            sum_squared_error / virtual_count_fixed
286        };
287
288        let indexed_vals = IndexedNumbers::from_unsorted_weighted_value_vec(&mut vals)
289            .expect("virtual_count is limited to u64 range above");
290        // Don't need vals any more, avoid accidental use
291        drop(vals);
292        assert_eq!(indexed_vals.virtual_len(), virtual_count);
293
294        // Calculate the median before making tiles, because for
295        // uneven lengths, tiles do not precisely contain the median.
296        let median = {
297            let mid = indexed_vals.virtual_len() / 2;
298            if 0 == indexed_vals.virtual_len() % 2 {
299                // len is checked to be > 0, so we
300                // must have at least 2 values here.
301                (indexed_vals[mid - 1], indexed_vals[mid]).average()
302            } else {
303                indexed_vals[mid]
304            }
305        };
306
307        // As an example, a TILE_COUNT of 3 means, we group into
308        // [min, med, max] (we get percentiles via a TILE_COUNT of
309        // 101, not 100, if we want a median bucket!). We need to go 2
310        // distances. If `vals` has 4 elements, that's 3 distances to
311        // go. The tile position 2 needs to go to vals index 3, the
312        // tile position 1 to vals index 1.5 -> round up to index 2.
313        //
314        // XX Why does this not do the averages any more?? Unlike
315        // median above.
316        let vals_distances = (virtual_count - 1) as f64;
317        let tiles_distances = (TILE_COUNT - 1) as f64;
318        let mut tiles = Vec::new();
319        for i in 0..TILE_COUNT {
320            let index = i as f64 / tiles_distances * vals_distances + 0.5;
321            let val = indexed_vals[index as u64];
322            tiles.push(val);
323        }
324
325        assert_eq!(indexed_vals.first(), tiles.first());
326        assert_eq!(indexed_vals.last(), tiles.last());
327
328        Ok(Stats {
329            view_type: PhantomData::default(),
330            num_values: usize::try_from(virtual_count)
331                .map_err(|_| StatsError::VirtualCountDoesNotFitUSize)?,
332            sum: virtual_sum,
333            average,
334            median,
335            variance,
336            tiles,
337        })
338    }
339}
340
341/// A way to give full access to `Stats` structs with the possible
342/// parameterizations, unlike the `dyn TableViewRow` approach which
343/// only gives access to that interface. Also avoid the need for
344/// boxing, perhaps performance relevant. (But then, somewhat
345/// funnily?, we need to implement `TableViewRow` for this, too.)
346pub enum SubStats<ViewType, const TILE_COUNT: usize> {
347    Count(Stats<u64, TILE_COUNT>),
348    ViewType(Stats<ViewType, TILE_COUNT>),
349}
350
351#[derive(Debug, Clone, Copy)]
352pub enum SubStatsKind {
353    Count,
354    ViewType,
355}
356
357impl<ViewType: From<u64> + ToStatsString, const TILE_COUNT: usize> TableViewRow<SubStatsKind>
358    for SubStats<ViewType, TILE_COUNT>
359{
360    fn table_view_header(
361        ctx: SubStatsKind,
362    ) -> Box<dyn AsRef<[(Cow<'static, str>, Unit, ColumnFormatting)]>> {
363        eprintln!("XX weirdly this is never used??");
364        match ctx {
365            SubStatsKind::Count => Stats::<u64, TILE_COUNT>::table_view_header(()),
366            SubStatsKind::ViewType => Stats::<ViewType, TILE_COUNT>::table_view_header(()),
367        }
368    }
369
370    fn table_view_row(&self, out: &mut Vec<(Cow<str>, Highlight)>) {
371        match self {
372            SubStats::Count(stats) => stats.table_view_row(out),
373            SubStats::ViewType(stats) => stats.table_view_row(out),
374        }
375    }
376}
377
378impl<ViewType: From<u64> + ToStatsString, const TILE_COUNT: usize> TableViewRow<()>
379    for Stats<ViewType, TILE_COUNT>
380{
381    fn table_view_header(_: ()) -> Box<dyn AsRef<[(Cow<'static, str>, Unit, ColumnFormatting)]>> {
382        let mut cols = vec![
383            ("n".into(), Unit::Count, ColumnFormatting::Number),
384            (
385                "sum".into(),
386                Unit::ViewType(ViewType::UNIT_SHORT),
387                ColumnFormatting::Number,
388            ),
389            (
390                "avg".into(),
391                Unit::ViewType(ViewType::UNIT_SHORT),
392                ColumnFormatting::Number,
393            ),
394            (
395                "median".into(),
396                Unit::ViewType(ViewType::UNIT_SHORT),
397                ColumnFormatting::Number,
398            ),
399            (
400                "SD".into(),
401                Unit::ViewType(ViewType::UNIT_SHORT),
402                ColumnFormatting::Number,
403            ),
404            ("".into(), Unit::None, ColumnFormatting::Spacer),
405        ];
406
407        for i in 0..TILE_COUNT {
408            cols.push((
409                format!("{:.2}", (i as f64) / ((TILE_COUNT - 1) as f64)).into(),
410                Unit::ViewType(ViewType::UNIT_SHORT),
411                ColumnFormatting::Number,
412            ));
413        }
414        Box::new(cols)
415    }
416
417    fn table_view_row(&self, out: &mut Vec<(Cow<str>, Highlight)>) {
418        let Self {
419            view_type: _,
420            num_values,
421            sum,
422            // using average_u64() instead
423            average: _,
424            // using standard_deviation_u64() instead
425            variance: _,
426            median,
427            tiles,
428        } = self;
429
430        out.push((num_values.to_string().into(), Highlight::Neutral));
431        out.push((
432            ViewType::from(u64::try_from(*sum).expect("sum must fit in u64 range"))
433                .to_stats_string()
434                .into(),
435            Highlight::Neutral,
436        ));
437        out.push((
438            ViewType::from(self.average_u64()).to_stats_string().into(),
439            Highlight::Neutral,
440        ));
441        out.push((
442            ViewType::from(*median).to_stats_string().into(),
443            Highlight::Neutral,
444        ));
445        out.push((
446            // bummer, float precision is lost here, but doesn't
447            // matter in our ns or us units
448            ViewType::from(self.standard_deviation_u64())
449                .to_stats_string()
450                .into(),
451            Highlight::Neutral,
452        ));
453
454        out.push(("".into(), Highlight::Spacer));
455
456        for val in tiles {
457            out.push((
458                ViewType::from(*val).to_stats_string().into(),
459                Highlight::Neutral,
460            ));
461        }
462    }
463}
464
465#[cfg(test)]
466mod tests {
467    use anyhow::Result;
468
469    use crate::stats_tables::stats::weighted::WEIGHT_ONE;
470
471    use super::*;
472
473    fn weighted(vals: &[u64]) -> Vec<WeightedValue> {
474        vals.iter()
475            .copied()
476            .map(|value| WeightedValue {
477                value,
478                weight: WEIGHT_ONE,
479            })
480            .collect()
481    }
482
483    #[test]
484    fn t_average_and_tiles_and_median() -> Result<()> {
485        let data = weighted(&[23, 4, 8, 30, 7]);
486        let stats = Stats::<u64, 4>::from_values(data)?;
487        assert_eq!(stats.average_u64(), 14); // 14.4
488        assert_eq!((stats.average * 10).round(), 144); // 14.4
489        assert_eq!(stats.tiles, [4, 7, 23, 30]); // 8 skipped
490        assert_eq!(stats.median, 8);
491        assert_eq!((stats.variance * 100).round(), 10424); // 104.24
492        assert_eq!(stats.standard_deviation_u64(), 10); // 10.2097992144802
493
494        let data = weighted(&[23, 4, 8, 31, 7]);
495        let stats = Stats::<u64, 4>::from_values(data)?;
496        assert_eq!(stats.average_u64(), 15); // 14.6
497        assert_eq!((stats.average * 10).round(), 146); // 14.6
498        assert_eq!(stats.tiles[0], 4);
499        assert_eq!(stats.tiles.len(), 4);
500        assert_eq!(stats.tiles[3], 31);
501        assert_eq!(stats.tiles, [4, 7, 23, 31]); // 8 skipped
502        assert_eq!(stats.median, 8);
503        assert_eq!((stats.variance * 100).round(), 11064); // 110.64
504        assert_eq!(stats.standard_deviation_u64(), 11); // 10.5185550338438
505
506        let data = weighted(&[23, 4, 8, 7]);
507        let stats = Stats::<u64, 4>::from_values(data)?;
508        assert_eq!(stats.average_u64(), 11); // 10.5
509        assert_eq!(stats.average, 10.5);
510        assert_eq!(stats.tiles, [4, 7, 8, 23]);
511        assert_eq!(stats.median, 8); // 7.5 rounded up ? XX
512
513        let data = weighted(&[23, 8, 7]);
514        let stats = Stats::<u64, 4>::from_values(data)?;
515        assert_eq!(stats.average_u64(), 13); // 12.6666666666667
516        assert_eq!(stats.average.to_num::<f64>(), 12.666666666511446); // 12.6666666666667
517        assert_eq!(stats.median, 8);
518        assert_eq!(stats.tiles, [7, 8, 8, 23]);
519
520        Ok(())
521    }
522
523    #[test]
524    fn t_median() -> Result<()> {
525        let data = weighted(&[23, 4, 8, 7]);
526        let stats = Stats::<u64, 4>::from_values(data)?;
527        assert_eq!(stats.median, 8); // 7.5 rounded up
528
529        let data = weighted(&[23, 4, 9, 7]);
530        let stats = Stats::<u64, 4>::from_values(data)?;
531        assert_eq!(stats.median, 8); // 8.0
532
533        let data = weighted(&[23, 4, 10, 7]);
534        let stats = Stats::<u64, 4>::from_values(data)?;
535        assert_eq!(stats.median, 9); // 8.5
536
537        let data = weighted(&[23, 4, 10, 7]);
538        let stats = Stats::<u64, 2>::from_values(data)?;
539        assert_eq!(stats.tiles, [4, 23]);
540        // Calculated from original values, not tiles:
541        assert_eq!(stats.median, 9); // 8.5
542
543        let data = weighted(&[23, 4, 7]);
544        let stats = Stats::<u64, 2>::from_values(data)?;
545        assert_eq!(stats.median, 7);
546
547        let data = weighted(&[23, 4, 7]);
548        let stats = Stats::<u64, 3>::from_values(data)?;
549        assert_eq!(stats.median, 7);
550
551        let data = weighted(&[23, 4, 7]);
552        let stats = Stats::<u64, 4>::from_values(data)?;
553        assert_eq!(stats.median, 7);
554
555        let data = weighted(&[23, 4]);
556        let stats = Stats::<u64, 4>::from_values(data)?;
557        assert_eq!(stats.median, 14); // 13.5
558
559        Ok(())
560    }
561
562    #[test]
563    fn t_weights() -> Result<()> {
564        let data = weighted(&[23, 4, 9, 4, 4, 7]); // 4 4 4 7 9 23
565        let stats = Stats::<u64, 4>::from_values(data)?;
566        assert_eq!(stats.average, 8.5);
567        assert_eq!(stats.median, 6); // 5.5
568        assert_eq!(stats.variance.to_num::<f64>(), 45.58333333325572); // 45.5833333333
569
570        let mut data = weighted(&[23, 4, 9, 7]);
571        data.push(WeightedValue {
572            value: 4,
573            weight: 2.try_into().unwrap(),
574        });
575        let stats = Stats::<u64, 4>::from_values(data)?;
576        assert_eq!(stats.median, 6); // 5.5
577        assert_eq!(stats.variance.to_num::<f64>(), 45.58333333325572); // 45.5833333333333
578
579        Ok(())
580    }
581}