chj_rustbin/
duu.rs

1use std::{
2    collections::{hash_map::Entry, HashMap},
3    ffi::OsString,
4    os::unix::fs::MetadataExt,
5    path::PathBuf,
6    sync::Mutex,
7};
8
9use anyhow::{anyhow, Context, Result};
10
11use crate::io::file_path_type::FileType;
12
/// Format `bytes` as GiB with exactly 3 decimal places. The integer
/// part is right-aligned in 3 columns, so every value up to 999.999
/// GiB yields a string of the same width.
pub fn bytes_to_gib_string(bytes: u64) -> String {
    // Integer arithmetic only: go via KiB to thousandths of a MiB
    // (adding half a MiB for rounding), then to thousandths of a GiB.
    let kib = bytes / 1024;
    let milli_mib = (kib + 512) * 1000 / 1024;
    let milli_gib = milli_mib / 1024;

    let whole = milli_gib / 1000;
    let thousandths = milli_gib % 1000;
    format!("{whole:>3}.{thousandths:03}")
}
27
#[test]
fn t_bytes_to_gib_string() {
    const MIB: u64 = 1024 * 1024;
    const GIB: u64 = 1024 * MIB;
    // (input in bytes, expected formatting)
    let cases: [(u64, &str); 7] = [
        (0, "  0.000"),
        (20000, "  0.000"),
        (500000, "  0.000"),
        (600000, "  0.001"),
        (GIB, "  1.000"),
        (512 * MIB, "  0.500"),
        (900 * GIB, "900.000"),
    ];
    for (bytes, expected) in cases {
        assert_eq!(bytes_to_gib_string(bytes), expected);
    }
}
39
/// Bytes as KB (units of 1024 bytes), rounded up.
pub fn bytes_to_kb(bytes: u64) -> u64 {
    // Deliberately not `(bytes + 1023) / 1024`: that addition
    // overflows for the topmost 1023 values of u64.
    bytes / 1024 + u64::from(bytes % 1024 != 0)
}
44
/// Scale a byte count down to a short number plus a unit prefix,
/// roughly in the manner of `du -h`.
///
/// `val` is divided by `powers` (rounding to nearest, halves up) for
/// as long as the result of the division is larger than 10; the
/// number of divisions carried out selects the prefix.
///
/// * `powers`: the ratio between two successive units (presumably
///   1000 or 1024 -- confirm with the callers). Values below 2 cannot
///   scale anything, in which case `val` is returned unscaled.
/// * `si`: only influences the first prefix, which is `"k"` if true
///   and `"K"` otherwise.
/// * `val`: the quantity, in bytes.
///
/// Returns the scaled value and its prefix (`""` if no scaling was
/// applied).
///
/// # Panics
///
/// Panics if more than 10 scaling steps are needed, which can only
/// happen with a `powers` much smaller than 1000.
pub fn to_human_readable(
    powers: u64,
    si: bool,
    // bytes
    mut val: u64,
) -> (u64, &'static str) {
    // What multiplier to use? Ah, `du` actually uses fractional
    // format below some value, like "7.6G"
    const MULTIPLIER: u64 = 10;

    // Dividing by 0 would panic and dividing by 1 would never get
    // below the threshold, i.e. loop forever.
    if powers < 2 {
        return (val, "");
    }

    // Smallest remainder that amounts to at least half of `powers`.
    let round_up_from = powers - powers / 2;
    let mut n = 0;
    loop {
        // Same value as `(val + powers / 2) / powers`, but without the
        // addition that overflows for `val` close to `u64::MAX`.
        let scaled = val / powers + u64::from(val % powers >= round_up_from);
        if scaled <= MULTIPLIER {
            break;
        }
        val = scaled;
        n += 1;
    }
    let unit = match n {
        0 => "",
        1 => {
            if si {
                "k"
            } else {
                "K"
            }
        }
        2 => "M",
        3 => "G",
        4 => "T",
        5 => "P",
        6 => "E",
        7 => "Z",
        8 => "Y",
        9 => "R",
        10 => "Q",
        // doesn't fit into 64-bit anyway (with powers of 1000 or 1024)
        _ => unreachable!("number too large, don't have a prefix"),
    };
    (val, unit)
}
87
// `man 3type stat`: st_blocks: Number of 512 B blocks allocated.
// The allocated size of an item in bytes is hence its block count
// multiplied by this constant.
const BLOCKSIZE: u64 = 512;
90
/// A failure to process a single item inside a directory (recorded
/// when the metadata of the item could not be retrieved)
pub struct ItemError {
    /// The type of the item
    pub file_type: FileType,
    /// The name of the item as reported by the directory listing
    pub file_name: OsString,
    /// The error, already formatted as a message
    pub error: String,
}
96
/// Disk usage of the contents of a particular directory
pub struct DirDiskUsage {
    /// Path to this directory
    pub path: PathBuf,
    /// Bytes allocated by the files directly inside this directory,
    /// including subdirectories themselves (excluding their
    /// contents). Does not include the files listed in
    /// `shared_files`.
    pub file_bytes: u64,
    /// Files with a link count > 1 are recorded here, and added after
    /// finishing the scan, when the actual share count is known, so
    /// that their disk usage can be split across the usage sites
    pub shared_files: Vec<InodeKey>,
    /// Reflecting the *contents* of subdirectories; an `Err` entry
    /// stands for a subdirectory that could not be processed
    pub subdirs: Vec<Result<DirDiskUsage>>,
    /// Errors while processing the items directly inside this
    /// directory (errors processing subdirectories are kept in
    /// `subdirs` instead)
    pub errors: Vec<ItemError>,
}
115
116impl DirDiskUsage {
117    /// in bytes
118    pub fn total_files(
119        &self,
120        shared_inodes: &HashMap<InodeKey, InodeData>,
121    ) -> u64 {
122        self.file_bytes
123            + self
124                .shared_files
125                .iter()
126                .map(|inode_key| {
127                    shared_inodes.get(inode_key).expect(
128                        "given correct shared_inodes table, entries are always present"
129                    ).bytes_share_rounded()
130                })
131                .sum::<u64>()
132    }
133
134    /// in bytes
135    pub fn total_subdirs(
136        &self,
137        shared_inodes: &HashMap<InodeKey, InodeData>,
138    ) -> u64 {
139        self.subdirs
140            .iter()
141            .map(|result| -> u64 {
142                match result {
143                    Ok(du) => du.total(shared_inodes),
144                    Err(_) => 0,
145                }
146            })
147            .sum()
148    }
149
150    /// total in bytes
151    pub fn total(&self, shared_inodes: &HashMap<InodeKey, InodeData>) -> u64 {
152        self.total_files(shared_inodes) + self.total_subdirs(shared_inodes)
153    }
154
155    /// Collect all errors (of all kinds) of this tree into `out`
156    pub fn get_errors(&self, limit: usize, out: &mut Vec<String>) {
157        for ItemError {
158            file_type,
159            file_name,
160            error,
161        } in &self.errors
162        {
163            if out.len() >= limit {
164                return;
165            }
166            out.push(format!(
167                "{file_type:?} item {file_name:?} in {:?}: {error:#}",
168                self.path
169            ));
170        }
171        for subdir in &self.subdirs {
172            if out.len() >= limit {
173                return;
174            }
175            match subdir {
176                Ok(du) => du.get_errors(limit, out),
177                Err(error) => {
178                    out.push(format!("{error:#}",));
179                }
180            }
181        }
182    }
183}
184
/// Identifies an inode: the inode number together with the device it
/// lives on. Used as the key of the shared inodes table.
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct InodeKey {
    /// Device id, as reported by the file's metadata
    pub dev: u64,
    /// Inode number, as reported by the file's metadata
    pub inode: u64,
}
190
/// Storage size and usage count of an inode that is reachable via
/// more than one hard link
pub struct InodeData {
    /// The usual blocks * blocksize of the inode's storage (as per
    /// stat)
    pub bytes: u64,
    /// This is not the inode count (number of times an inode is used
    /// in the file system), but only the number of times this inode
    /// is seen in the file system tree we're looking at. This is used
    /// when `share_globally` is *not* true.
    pub share_count: u64,
}

impl InodeData {
    /// The number of bytes to consider for each usage site: `bytes`
    /// divided by `share_count`, rounded to the nearest integer
    /// (exact halves round up).
    ///
    /// # Panics
    ///
    /// Panics if `share_count` is 0.
    pub fn bytes_share_rounded(&self) -> u64 {
        let quotient = self.bytes / self.share_count;
        let remainder = self.bytes % self.share_count;
        // Smallest remainder that amounts to at least half a share.
        // (Adding `(share_count + 1) / 2` before dividing would round
        // up fractions below one half for odd counts, and the addition
        // could overflow.)
        let round_up_from = self.share_count - self.share_count / 2;
        quotient + u64::from(remainder >= round_up_from)
    }
}
208
#[test]
fn t_bytes_share_rounded() {
    // (bytes, share_count, expected share per usage site)
    let cases = [
        (5, 3, 2),
        (5, 4, 1),
        (6, 3, 2),
        (6, 2, 3),
        (6, 4, 2),
        (6, 5, 1),
    ];
    for (bytes, share_count, expected) in cases {
        let data = InodeData { share_count, bytes };
        assert_eq!(data.bytes_share_rounded(), expected);
    }
}
221
/// Settings and shared state of a disk usage scan
pub struct GetDirDiskUsage {
    /// If true, subdirectories residing on another device than their
    /// parent directory are neither counted nor descended into
    pub one_file_system: bool,
    /// If true, a file with a link count > 1 is charged its size
    /// divided by that link count right away; if false, such files
    /// are registered in `shared_inodes` instead
    pub share_globally: bool,
    /// The files with a link count > 1 seen during the scan, with
    /// their size and the number of times they were seen (only filled
    /// when `share_globally` is false)
    pub shared_inodes: Mutex<HashMap<InodeKey, InodeData>>,
}
227
228impl GetDirDiskUsage {
229    pub fn dir_disk_usage(
230        &self,
231        path: PathBuf,
232        current_dev: u64,
233    ) -> Result<DirDiskUsage> {
234        let items = std::fs::read_dir(&path)
235            .with_context(|| anyhow!("opening directory {path:?}"))?;
236        let mut file_bytes = 0;
237        let mut errors = vec![];
238        let mut shared_files = vec![];
239        let subdirs = Mutex::new(vec![]);
240        rayon::scope(|scope| -> Result<()> {
241            for item in items {
242                let item =
243                    item.with_context(|| anyhow!("reading items in {path:?}"))?;
244                let file_name = item.file_name();
245                match item.metadata() {
246                    Ok(metadata) => {
247                        /* Number of 512 B blocks allocated */
248                        let blocks = metadata.blocks();
249                        let blocksize = BLOCKSIZE; // *not* s.blksize()!
250
251                        let mut inc_file_bytes = || {
252                            file_bytes += blocks * blocksize;
253                        };
254
255                        if metadata.is_dir() {
256                            let new_dev = metadata.dev();
257
258                            if (!self.one_file_system) || new_dev == current_dev
259                            {
260                                // Include counting the dir too (the
261                                // 'shell' only; and only if
262                                // recursing, since otherwise the
263                                // space use is on the other file
264                                // system):
265                                inc_file_bytes();
266
267                                // recurse
268                                let mut path = path.clone();
269                                path.push(&file_name);
270                                let subdirs = &subdirs;
271                                scope.spawn(move |_| {
272                                    let result =
273                                        self.dir_disk_usage(path, new_dev);
274                                    subdirs
275                                        .lock()
276                                        .expect("no crash")
277                                        .push(result);
278                                });
279                            }
280                        } else {
281                            let nlink = metadata.nlink();
282                            if nlink > 1 && blocks > 0 {
283                                if self.share_globally {
284                                    file_bytes += (blocks * blocksize
285                                        + (nlink + 1) / 2)
286                                        / nlink;
287                                } else {
288                                    let key = InodeKey {
289                                        dev: metadata.dev(),
290                                        inode: metadata.ino(),
291                                    };
292
293                                    shared_files.push(key.clone());
294
295                                    let mut shared = self
296                                        .shared_inodes
297                                        .lock()
298                                        .expect("no crash");
299                                    match shared.entry(key) {
300                                        Entry::Occupied(mut o) => {
301                                            let data = o.get_mut();
302                                            data.share_count += 1;
303                                        }
304                                        Entry::Vacant(v) => {
305                                            v.insert(InodeData {
306                                                share_count: 1,
307                                                bytes: blocks * blocksize,
308                                            });
309                                        }
310                                    }
311                                }
312                            } else {
313                                inc_file_bytes()
314                            }
315                        }
316                    }
317                    Err(e) => {
318                        // This call should never fail on Linux?
319                        let file_type: FileType = (&item.file_type()?).into();
320                        errors.push(ItemError {
321                            file_type,
322                            file_name,
323                            error: format!("{e:#}"),
324                        });
325                    }
326                }
327            }
328            Ok(())
329        })?;
330
331        let subdirs = subdirs.into_inner().expect("no crash either");
332
333        Ok(DirDiskUsage {
334            path,
335            file_bytes,
336            shared_files,
337            subdirs,
338            errors,
339        })
340    }
341}