evobench_tools/
git.rs

1//! Retrieve a history DAG from Git
2//!
3//! For tracking performance changes across the history. Should
4//! perhaps be moved to the `run-git` crate.
5
6use std::{
7    collections::{BTreeSet, HashMap, HashSet},
8    fmt::{Debug, Display},
9    ops::Index,
10    path::Path,
11    process::{Command, Stdio},
12    sync::{Arc, Mutex},
13};
14
15use anyhow::{Context, Result, anyhow, bail};
16use itertools::Itertools;
17use kstring::KString;
18use smallvec::SmallVec;
19
20use crate::date_and_time::unixtime::Unixtime;
21pub use crate::serde_types::git_hash::GitHash;
22
23#[derive(Debug)]
24pub struct GitCommit<RefType> {
25    pub commit_hash: GitHash,
26    pub author_time: Unixtime,
27    pub committer_time: Unixtime,
28    pub parents: SmallVec<[RefType; 1]>,
29}
30
31#[derive(Debug)]
32pub struct EnrichedGitCommit<RefType> {
33    pub commit: GitCommit<RefType>,
34    /// The length of the longest parent chain (i.e. 0 for the initial
35    /// commit)
36    pub depth: usize,
37}
38
39impl GitCommit<Id<ToEnrichedCommit>> {
40    /// Turn the `Id`s to commit hashes. Panics if the contained ids
41    /// are not in `data`!
42    pub fn with_ids_as_hashes(&self, data: &GitGraphData) -> GitCommit<GitHash> {
43        let Self {
44            commit_hash,
45            author_time,
46            committer_time,
47            parents,
48        } = self;
49
50        let parents = parents
51            .iter()
52            .map(|parent| data[*parent].commit.commit_hash.clone())
53            .collect();
54
55        GitCommit {
56            commit_hash: commit_hash.clone(),
57            author_time: *author_time,
58            committer_time: *committer_time,
59            parents,
60        }
61    }
62}
63
64impl EnrichedGitCommit<Id<ToEnrichedCommit>> {
65    /// Turn the `Id`s to commit hashes. Panics if the contained ids
66    /// are not in `data`!
67    pub fn with_ids_as_hashes(&self, data: &GitGraphData) -> EnrichedGitCommit<GitHash> {
68        let Self { commit, depth } = self;
69        let commit = commit.with_ids_as_hashes(data);
70        EnrichedGitCommit {
71            commit,
72            depth: *depth,
73        }
74    }
75}
76
77// XX arbitrarily, was meant just for testing (same output as original
78// git log output)
79impl Display for GitCommit<GitHash> {
80    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
81        let Self {
82            commit_hash,
83            author_time,
84            committer_time,
85            parents,
86        } = self;
87        let a = author_time.0;
88        let c = committer_time.0;
89        let parents = parents.iter().map(|p| p.to_string()).join(" ");
90        write!(f, "{commit_hash},{a},{c},{parents}")
91    }
92}
93
94// XX arbitrarily
95impl Display for EnrichedGitCommit<GitHash> {
96    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
97        let Self { commit, depth } = self;
98        write!(f, "{depth}\t{commit}")
99    }
100}
101
102#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
103pub struct Id<Kind>(u32, Kind);
104
105#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
106pub struct ToEnrichedCommit;
107
108// Consciously does not include a repository base path, to allow to
109// collect graph data from multiple of them? Is this sensible or not
110// really?
111#[derive(Debug)]
112pub struct GitGraphData {
113    by_hash: HashMap<GitHash, Id<ToEnrichedCommit>>,
114    ecommits: Vec<EnrichedGitCommit<Id<ToEnrichedCommit>>>,
115}
116
117#[derive(thiserror::Error, Debug)]
118#[error("more than u32::max commits")]
119pub struct MoreThanU32Commits;
120
121impl Id<ToEnrichedCommit> {
122    pub fn get(self, data: &GitGraphData) -> Option<&EnrichedGitCommit<Id<ToEnrichedCommit>>> {
123        data.get(self)
124    }
125}
126
127impl GitGraphData {
128    fn new() -> Self {
129        Self {
130            by_hash: HashMap::new(),
131            ecommits: Vec::new(),
132        }
133    }
134
135    pub fn get(
136        &self,
137        id: Id<ToEnrichedCommit>,
138    ) -> Option<&EnrichedGitCommit<Id<ToEnrichedCommit>>> {
139        self.ecommits
140            .get(usize::try_from(id.0).expect("at least 32 bit platform"))
141    }
142
143    pub fn get_by_hash(&self, h: &GitHash) -> Option<Id<ToEnrichedCommit>> {
144        self.by_hash.get(h).copied()
145    }
146
147    // Does not check whether commit is already contained! But *does*
148    // check that the parent ids are in range (they are referenced to
149    // calculate the depth).
150    pub fn unchecked_push(
151        &mut self,
152        commit: GitCommit<Id<ToEnrichedCommit>>,
153    ) -> Result<Id<ToEnrichedCommit>, MoreThanU32Commits> {
154        let commit_hash = commit.commit_hash.clone();
155
156        let depth = commit
157            .parents
158            .iter()
159            .map(|parent_id| self.get(*parent_id).expect("XXX").depth)
160            .max()
161            .map(|m| m + 1)
162            .unwrap_or(0);
163
164        let ecommit = EnrichedGitCommit { commit, depth };
165        let id = self.ecommits.len();
166        let id = Id(
167            u32::try_from(id).map_err(|_| MoreThanU32Commits)?,
168            ToEnrichedCommit,
169        );
170        self.by_hash.insert(commit_hash, id);
171        self.ecommits.push(ecommit);
172        Ok(id)
173    }
174
175    pub fn history_as_btreeset_from(
176        &self,
177        mut id: Id<ToEnrichedCommit>,
178    ) -> BTreeSet<Id<ToEnrichedCommit>> {
179        let mut ids = BTreeSet::new();
180        let mut stack_of_commits_to_follow = Vec::new();
181        loop {
182            if ids.insert(id) {
183                let ecommit = &self[id];
184                let mut parents = ecommit.commit.parents.iter();
185                if let Some(first_parent) = parents.next() {
186                    id = *first_parent;
187                    for parent in parents {
188                        stack_of_commits_to_follow.push(*parent);
189                    }
190                    continue;
191                }
192            }
193            if let Some(to_follow) = stack_of_commits_to_follow.pop() {
194                id = to_follow;
195            } else {
196                break;
197            }
198        }
199        ids
200    }
201
202    /// Find the ancestor commit that is closest to `for_id`; "close"
203    /// means least steps as long as on a branch; when there are
204    /// multiple branches (bevore a merge) the commit with the newer
205    /// commit time is chosen. It is guaranteed that the id passed to
206    /// `is_match` is contained in `self`.
207    pub fn closest_matching_ancestor_of(
208        &self,
209        for_id: Id<ToEnrichedCommit>,
210        is_match: impl Fn(Id<ToEnrichedCommit>) -> bool,
211    ) -> Result<Option<Id<ToEnrichedCommit>>> {
212        // XX Could use roaring bitmaps instead of HashSet.
213
214        // To detect fork points that were already visited.
215        let mut seen_commit_ids = HashSet::new();
216        seen_commit_ids.insert(for_id);
217
218        // Can't use Vec since we'd need to splice. Linked list or:
219        let mut current_commits = HashSet::new();
220        self.get(for_id)
221            .ok_or_else(|| anyhow!("invalid parent_id"))?;
222        current_commits.insert(for_id);
223
224        let get_id = |id: Id<ToEnrichedCommit>| self.get(id).expect("internal consistency");
225
226        while !current_commits.is_empty() {
227            // Of all the current commits that match the filter,
228            // return the newest one, if any.
229            if let Some(matching_id) = current_commits
230                .iter()
231                .copied()
232                .filter(|id| is_match(*id))
233                .max_by_key(|id| {
234                    let commit = get_id(*id);
235                    commit.commit.committer_time
236                })
237            {
238                return Ok(Some(matching_id));
239            }
240            // Of all the current commits follow the newest one back
241            // to its parents.
242            let commit_id_to_follow = current_commits
243                .iter()
244                .copied()
245                .max_by_key(|id| get_id(*id).commit.committer_time)
246                .expect("exiting before here when current_commits is empty");
247            current_commits.remove(&commit_id_to_follow);
248            let commit_to_follow = get_id(commit_id_to_follow);
249            for commit_id in &commit_to_follow.commit.parents {
250                if !seen_commit_ids.contains(commit_id) {
251                    current_commits.insert(*commit_id);
252                    seen_commit_ids.insert(*commit_id);
253                }
254            }
255        }
256        Ok(None)
257    }
258
259    pub fn sorted_by<T: Ord>(
260        &self,
261        commits: &BTreeSet<Id<ToEnrichedCommit>>,
262        mut by: impl FnMut(&EnrichedGitCommit<Id<ToEnrichedCommit>>) -> T,
263    ) -> Vec<Id<ToEnrichedCommit>> {
264        let mut vec: Vec<_> = commits.iter().copied().collect();
265        vec.sort_by_key(|id| by(&self[*id]));
266        vec
267    }
268
269    pub fn ids_as_commits<'s: 'ids, 'ids>(
270        &'s self,
271        ids: &'ids [Id<ToEnrichedCommit>],
272    ) -> impl DoubleEndedIterator<Item = &'s EnrichedGitCommit<Id<ToEnrichedCommit>>>
273    + ExactSizeIterator
274    + use<'s, 'ids> {
275        ids.iter().map(|id| &self[*id])
276    }
277
278    pub fn commits(&self) -> &[EnrichedGitCommit<Id<ToEnrichedCommit>>] {
279        &self.ecommits
280    }
281
282    pub fn add_history_from_dir_ref(
283        &mut self,
284        in_directory: impl AsRef<Path>,
285        entry_reference: &str,
286    ) -> Result<GitEntrypoint> {
287        let in_directory = in_directory.as_ref();
288        // XX first check if entry_reference is already indexed? But
289        // need name index, too, then. And should make directory part
290        // of the context, here, really.
291        let commits = git_log_commits(in_directory, entry_reference)?;
292        if commits.is_empty() {
293            bail!("invalid Git reference {entry_reference:?} in Git dir {in_directory:?}")
294        }
295        Ok(GitEntrypoint::from_commits(
296            KString::from_ref(entry_reference),
297            commits.iter().rev(),
298            self,
299        )?)
300    }
301}
302
303impl Index<Id<ToEnrichedCommit>> for GitGraphData {
304    type Output = EnrichedGitCommit<Id<ToEnrichedCommit>>;
305
306    fn index(&self, index: Id<ToEnrichedCommit>) -> &Self::Output {
307        &self.ecommits[usize::try_from(index.0).expect("usize must be at least 32 bit wide")]
308    }
309}
310
311#[derive(Debug)]
312pub struct GitGraph(Mutex<GitGraphData>);
313
314impl GitGraph {
315    pub fn new() -> Arc<Self> {
316        Self(Mutex::new(GitGraphData::new())).into()
317    }
318
319    pub fn lock(&self) -> std::sync::MutexGuard<'_, GitGraphData> {
320        match self.0.lock() {
321            Ok(l) => l,
322            // Lock poisoning should never be able to hurt us (XX?),
323            // thus just recover
324            Err(l) => l.into_inner(),
325        }
326    }
327}
328
329#[derive(Debug)]
330pub struct GitEntrypoint {
331    pub name: KString,
332    pub commit_id: Id<ToEnrichedCommit>,
333}
334
335impl GitEntrypoint {
336    /// `commits` must come in order of creation, i.e. parents must
337    /// come before children, or this panics! Also panics if commits
338    /// is empty!
339    pub fn from_commits<'c>(
340        entry_reference: KString,
341        commits: impl Iterator<Item = &'c GitCommit<GitHash>>,
342        graph_lock: &mut GitGraphData,
343    ) -> Result<Self, MoreThanU32Commits> {
344        let mut entry_commit_id: Option<Id<ToEnrichedCommit>> = None;
345        for GitCommit {
346            commit_hash,
347            author_time,
348            committer_time,
349            parents,
350        } in commits
351        {
352            // debug!("processing commit {commit_hash}");
353            if let Some(id) = graph_lock.get_by_hash(commit_hash) {
354                // debug!("already recorded {commit_hash} earlier, ignoring the rest");
355                // But need to continue, as there can be other merged
356                // branches that come up later in the git log output.
357                entry_commit_id = Some(id);
358            } else {
359                let commit = GitCommit {
360                    commit_hash: commit_hash.clone(),
361                    author_time: *author_time,
362                    committer_time: *committer_time,
363                    parents: parents
364                        .iter()
365                        .map(|parent| {
366                            graph_lock.get_by_hash(parent).unwrap_or_else(|| {
367                                panic!(
368                                    "can't find parent {parent} of commit {commit_hash} -- \
369                                     need commits with the oldest first!"
370                                )
371                            })
372                        })
373                        .collect(),
374                };
375                entry_commit_id = Some(graph_lock.unchecked_push(commit)?);
376            }
377        }
378        Ok(GitEntrypoint {
379            name: entry_reference,
380            commit_id: entry_commit_id.expect("to be given non-empty commits iterator"),
381        })
382    }
383}
384
385/// Returns the commits with the newest one first. Careful:
386/// `GitHistory::from_commits` expects them in the reverse order of
387/// this one.  This returns a Vec (for lifetime reasons but also)
388/// because it needs to be reversed afterwards, but also because
389/// following branched Git history (via git log) can find branch with
390/// known commits at some point, but the other still needing
391/// exploration. Would need to analyze the history on the go to know
392/// if stopping is OK. Thus for now, just get the whole
393/// history. Returns the empty vector if the given reference does not
394/// resolve! You usually do not want to use this function directly,
395/// but instead initialize a GitGraph, get the lock, then run
396/// `add_history_from_dir_ref` on it, which then uses this function.
397pub fn git_log_commits(
398    in_directory: impl AsRef<Path>,
399    entry_reference: &str,
400) -> Result<Vec<GitCommit<GitHash>>> {
401    let in_directory = in_directory.as_ref();
402    let mut c = Command::new("git");
403
404    c.args(&[
405        "log",
406        &{
407            let commit_hash = "%H";
408            let author_time = "%at";
409            let committer_time = "%ct";
410            let parent_hashes = "%P";
411            format!("--pretty={commit_hash},{author_time},{committer_time},{parent_hashes}")
412        },
413        entry_reference,
414    ]);
415    let str_from_bytes =
416        |bs| std::str::from_utf8(bs).expect("git always gives ascii with given arguments");
417    c.current_dir(in_directory);
418
419    c.stdout(Stdio::piped());
420    // stderr, too, but it's the case by default, anyway!
421    let output = c
422        .output()
423        .with_context(|| anyhow!("in directory {in_directory:?}"))?;
424    if !output.status.success() {
425        // And I have such code already (in run-git, right?)
426        let err = String::from_utf8_lossy(&output.stderr);
427        bail!("failure in git directory {in_directory:?}: {err}",)
428    }
429    output
430        .stdout
431        .split(|b| *b == b'\n')
432        .filter(|line| !line.is_empty())
433        .map(|line| -> Result<GitCommit<GitHash>> {
434            let items: SmallVec<[_; 4]> = line.split(|b| *b == b',').collect();
435            if let [commit_hash, author_time, committer_time, parents] = items.as_slice() {
436                let commit_hash = GitHash::try_from(*commit_hash)?;
437                let author_time = Unixtime(u64::from_str_radix(str_from_bytes(author_time), 10)?);
438                let committer_time =
439                    Unixtime(u64::from_str_radix(str_from_bytes(committer_time), 10)?);
440                let parents: SmallVec<_> = parents
441                    .split(|b| *b == b' ')
442                    .into_iter()
443                    .filter(|bs| !bs.is_empty())
444                    .map(GitHash::try_from)
445                    .collect::<Result<_>>()?;
446                Ok(GitCommit {
447                    commit_hash,
448                    author_time,
449                    committer_time,
450                    parents,
451                })
452            } else {
453                unreachable!("4 fields from git")
454            }
455        })
456        .collect()
457}
458
459#[cfg(test)]
460mod tests {
461    use super::*;
462
463    #[test]
464    fn t_() -> Result<()> {
465        let git_in_cwd = AsRef::<Path>::as_ref("../.git");
466        if !git_in_cwd.is_dir() {
467            eprintln!("git.rs: not tested due to not being in a Git repository");
468            return Ok(());
469        }
470        let graph = GitGraph::new();
471        let mut graph_guard = graph.lock();
472
473        let interesting_commit_ids = [
474            // a later commit:
475            "710e7a2f4d48124964efbe48c901259ae31cfd6a",
476            // the merge commit:
477            "f73da5abcc389db7754715a9fecadb478ecfbc16",
478            // ancestors in two separate branches leading to the commit above:
479            "9fd0ad621328a11a984aaa0700d54d05af6a899a",
480            "d165351476db2d65c3efcda89b4a99decede3784",
481        ];
482
483        // Test depth:
484
485        let refs = interesting_commit_ids.map(|name| {
486            graph_guard
487                .add_history_from_dir_ref(git_in_cwd, name)
488                .map_err(|e| e.to_string())
489        });
490
491        dbg!(&refs);
492
493        assert_eq!(
494            refs[2].as_ref().unwrap().name.as_ref(),
495            "9fd0ad621328a11a984aaa0700d54d05af6a899a"
496        );
497
498        let ids: [Id<ToEnrichedCommit>; _] = refs.map(|res| res.unwrap().commit_id);
499
500        let depths = ids.map(|id| id.get(&graph_guard).unwrap().depth);
501        assert_eq!(depths, [166, 163, 159, 161]);
502
503        // Test closest_matching_ancestor_of:
504
505        let closest = graph_guard
506            .closest_matching_ancestor_of(ids[0], |id| (&ids[2..]).contains(&id))?
507            .expect("to find it");
508        assert_eq!(closest, Id(165, ToEnrichedCommit));
509        assert_eq!(
510            closest
511                .get(&graph_guard)
512                .expect("contained")
513                .commit
514                .commit_hash
515                .to_string(),
516            "9fd0ad621328a11a984aaa0700d54d05af6a899a"
517        );
518
519        Ok(())
520    }
521}