evobench_tools/io_utils/
zstd_file.rs

1//! ZSTD compression and transparent decompression for files
2
3use std::{
4    ffi::{OsStr, OsString},
5    fs::File,
6    io::Read,
7    os::unix::fs::MetadataExt,
8    path::Path,
9    process::{ChildStdout, Command, Stdio},
10};
11
12use anyhow::{Context, Result, anyhow, bail};
13use cj_path_util::unix::polyfill::add_extension;
14use memmap2::{Mmap, MmapOptions};
15use ruzstd::{FrameDecoder, StreamingDecoder};
16
17use crate::{ctx, io_utils::tempfile_utils::TempfileOptions};
18
// Selects how `decompressed_file` decompresses `.zstd` input: `true`
// pipes the file through the external `zstd` command, `false` decodes
// in-process via `ruzstd`'s `StreamingDecoder`. This only affects
// streaming decompression; compression is always done via the tool.
const USING_EXTERNAL_TOOL: bool = false;
21
/// Classification of a path by its file extension, as determined by
/// `file_extension`.
#[derive(Debug, PartialEq)]
enum Extension {
    /// The last extension is "zstd" (the file is to be decompressed).
    ZStd,
    /// The last extension is something other than "zstd" (the file
    /// is read as is).
    Other,
}
27
28// Expect a file extension in `path`, return whether it is "zstd" or
29// the `expected_suffix`. Anything else yields an error. If not given,
30// accepts any suffix (but one is required).
31fn file_extension<P: AsRef<Path>>(path: P, expected_suffix: Option<&str>) -> Result<Extension> {
32    let path = path.as_ref();
33    let ext = path.extension().ok_or_else(|| {
34        let _hold;
35        let extension_msg = if let Some(expected_suffix) = expected_suffix {
36            _hold = format!("{expected_suffix:?}");
37            &_hold
38        } else {
39            "any extension"
40        };
41        anyhow!("missing file extension, expecting {extension_msg} or \"zstd\": {path:?}")
42    })?;
43
44    match ext.to_string_lossy().as_ref() {
45        "zstd" => {
46            if let Some(expected_suffix) = expected_suffix {
47                let stem = path.with_extension("");
48                let ext2 = stem.extension().ok_or_else(|| {
49                    anyhow!(
50                        "missing second file extension, after \"zstd\", \
51                     expecting {expected_suffix:?}: {path:?}"
52                    )
53                })?;
54                match ext2.to_string_lossy().as_ref() {
55                    s if &*s == expected_suffix => Ok(Extension::ZStd),
56                    _ => bail!(
57                        "unknown second file extension {ext2:?} after \"zstd\", \
58                     expecting {expected_suffix:?}: {path:?}"
59                    ),
60                }
61            } else {
62                Ok(Extension::ZStd)
63            }
64        }
65        ext_str => {
66            if let Some(expected_suffix) = expected_suffix {
67                if ext_str == expected_suffix {
68                    Ok(Extension::Other)
69                } else {
70                    bail!(
71                        "unknown file extension {ext:?}, expecting {expected_suffix:?} \
72                     or \"zstd\": {path:?}"
73                    )
74                }
75            } else {
76                Ok(Extension::Other)
77            }
78        }
79    }
80}
81
/// Checks the accepted classifications and the exact error messages
/// of `file_extension` when an expected suffix is given.
#[test]
fn t_file_extension() {
    use Extension::*;

    // (path, expected suffix, expected classification)
    let accepted = [
        ("foo.x", "x", Other),
        ("foo.x.zstd", "x", ZStd),
        ("foo.z.x", "x", Other),
        ("foo.z.x.zstd", "x", ZStd),
    ];
    for (path, suffix, expected) in accepted {
        let got = file_extension(path, Some(suffix)).expect("test call should not give an error");
        assert_eq!(got, expected);
    }

    // (path, expected suffix, expected error message)
    let rejected = [
        (
            "foo.x",
            "y",
            "unknown file extension \"x\", expecting \"y\" or \"zstd\": \"foo.x\"",
        ),
        (
            "foo.x.zstd",
            "y",
            "unknown second file extension \"x\" after \"zstd\", expecting \"y\": \"foo.x.zstd\"",
        ),
        (
            "foo.zstd",
            "y",
            "missing second file extension, after \"zstd\", expecting \"y\": \"foo.zstd\"",
        ),
        (
            "foo",
            "y",
            "missing file extension, expecting \"y\" or \"zstd\": \"foo\"",
        ),
    ];
    for (path, suffix, message) in rejected {
        let got = file_extension(path, Some(suffix))
            .err()
            .expect("test call should give an error")
            .to_string();
        assert_eq!(got, message);
    }
}
115
/// Marker trait combining `Read` and `Send`, used as the trait object
/// type returned by `decompressed_file` (`Box<dyn SendRead>`).
pub trait SendRead: Read + Send {}

// The concrete reader types that `decompressed_file` can return:
// in-process zstd decoding of a file, ...
impl SendRead for StreamingDecoder<std::fs::File, FrameDecoder> {}
// ... the stdout pipe of an external `zstd` process, ...
impl SendRead for ChildStdout {}
// ... and a plain (uncompressed) file.
impl SendRead for File {}
121
122/// Transparently decompress zstd files if they have a .zstd suffix;
123/// after that, expecting the `expected_suffix` (which must be given
124/// *without* a leading dot) if given.
125pub fn decompressed_file(path: &Path, expected_suffix: Option<&str>) -> Result<Box<dyn SendRead>> {
126    let ext = file_extension(path, expected_suffix)?;
127
128    let file_open = || File::open(path).with_context(|| anyhow!("opening file {path:?}"));
129
130    match ext {
131        Extension::ZStd => {
132            if USING_EXTERNAL_TOOL {
133                let mut c = Command::new("zstd");
134                let args: Vec<OsString> = vec!["-dcf".into(), "--".into(), path.into()];
135                c.args(args);
136                c.stdout(Stdio::piped());
137                let child = c.spawn().map_err(ctx!("spawning command {c:?}"))?;
138                Ok(Box::new(child.stdout.expect("present since configured")))
139            } else {
140                let input = file_open()?;
141                Ok(Box::new(
142                    StreamingDecoder::new(input).map_err(ctx!("zstd-decoding {path:?}"))?,
143                ))
144            }
145        }
146        Extension::Other => Ok(Box::new(file_open()?)),
147    }
148}
149
150/// Open the file as a mmap. `.zstd` files are first decompressed to
151/// `uncompressed_path` (or `.zstd.uncompressed` if not given) if the
152/// path does not exist already. The MMap is created using 2MB
153/// huge-pages on Linux. The usual caveats for memory maps applies:
154/// modifications of the file while using the map can change the data
155/// during parsing, which is safe or not depending on the
156/// parser. Truncating the file while accessing it will segfault the
157/// process. Leaving it marked safe here, for now.
158pub fn decompressed_file_mmap(
159    path: &Path,
160    uncompressed_path: Option<&Path>,
161    expected_suffix: Option<&str>,
162) -> Result<Mmap> {
163    let ext = file_extension(path, expected_suffix)?;
164
165    let file_open =
166        |path: &Path| File::open(path).with_context(|| anyhow!("opening file {path:?}"));
167
168    let tmp;
169    let uncompressed_path = match ext {
170        Extension::ZStd => {
171            let uncompressed_path = if let Some(uncompressed_path) = uncompressed_path {
172                uncompressed_path.to_owned()
173            } else {
174                add_extension(path, "uncompressed")
175                    .ok_or_else(|| anyhow!("appending extension to {path:?}"))?
176            };
177            if !uncompressed_path.exists() {
178                let tmp = TempfileOptions {
179                    target_path: uncompressed_path.clone(),
180                    retain_tempfile: false,
181                    migrate_access: false,
182                }
183                .tempfile()?;
184
185                let mut c = Command::new("zstd");
186                let args: Vec<OsString> = vec![
187                    "-df".into(),
188                    "--quiet".into(),
189                    "-o".into(),
190                    tmp.temp_path().into(),
191                    "--".into(),
192                    path.into(),
193                ];
194                c.args(args);
195                let mut child = c.spawn().map_err(ctx!("spawning command {c:?}"))?;
196                let status = child.wait()?;
197                if !status.success() {
198                    bail!("{c:?} failed: {status}");
199                }
200                tmp.finish()?;
201            }
202            tmp = uncompressed_path;
203            &tmp
204        }
205        Extension::Other => path,
206    };
207
208    let input = file_open(&uncompressed_path)?;
209
210    let meta = input.metadata()?;
211    let size: usize = meta.size().try_into()?;
212    unsafe {
213        // As safe as the function docs says
214        MmapOptions::new().huge(Some(21)).len(size).map(&input)
215    }
216    .map_err(ctx!("mmap for file {uncompressed_path:?}"))
217}
218
219/// If quiet is false, lets messaging by the `zstd` tool show up on
220/// stdout/err. If true, silences reporting output but captures error
221/// messages and reports those in the resulting error.
222pub fn compress_file(source_path: &Path, target_path: &Path, quiet: bool) -> Result<()> {
223    let mut c = Command::new("zstd");
224    if quiet {
225        c.arg("--quiet");
226        c.stdout(Stdio::piped());
227        c.stderr(Stdio::piped());
228    }
229    let args: &[&OsStr] = &[
230        "-o".as_ref(),
231        // XX: is this argument position safe against option injection?
232        target_path.as_ref(),
233        "--".as_ref(),
234        source_path.as_ref(),
235    ];
236    c.args(args);
237    let output = c.output().map_err(ctx!("running command {c:?}"))?;
238    let status = output.status;
239    if status.success() {
240        Ok(())
241    } else {
242        let outputs = if quiet {
243            let mut outputs = String::from_utf8_lossy(&output.stdout).into_owned();
244            outputs.push_str(&String::from_utf8_lossy(&output.stderr));
245            format!("{outputs:?}")
246        } else {
247            "not captured".into()
248        };
249        bail!("running zstd {args:?}: {status} with outputs {outputs}")
250    }
251}