ahtml/
lib.rs

1//! Html dom abstraction, with runtime typing.
2
3pub mod allocator;
4pub mod arc_util;
5pub mod flat;
6pub mod more_vec;
7pub mod myfrom;
8pub mod stillvec;
9pub mod util;
10
11use ahtml_html::meta::{read_meta_db, ElementMeta, MetaDb};
12use allocator::Context;
13pub use allocator::{
14    AId, ASlice, AVec, AllocatorType, Element, HtmlAllocator, HtmlAllocatorPool, Node, SerHtmlFrag,
15    ToASlice,
16};
17use anyhow::{bail, Result};
18use kstring::KString;
19use lazy_static::lazy_static;
20use std::{cell::RefMut, io::Write};
21
22use crate::myfrom::MyFrom;
23
24pub const NBSP: &str = "\u{00A0}";
25
// Byte-order mark, see
// https://www.w3.org/International/questions/qa-byte-order-mark#problems
const BOM: &str = "\u{FEFF}";
#[cfg(test)]
#[test]
fn t_file_encoding() {
    // U+FEFF has to come out as the three-byte UTF-8 signature.
    let expected: &[u8] = &[0xEF, 0xBB, 0xBF];
    assert_eq!(BOM.as_bytes(), expected);
}
33
34const DOCTYPE: &str = "<!DOCTYPE html>\n";
35
36pub trait Print {
37    /// Print serialized HTML.
38    fn print_html_fragment(&self, out: &mut impl Write, allocator: &HtmlAllocator) -> Result<()>;
39
40    /// Print plain text, completely *ignoring* HTML markup. Can
41    /// currently only give an error if encountering preserialized
42    /// HTML.
43    fn print_plain(&self, out: &mut String, allocator: &HtmlAllocator) -> Result<()>;
44
45    fn to_html_fragment_string(&self, allocator: &HtmlAllocator) -> Result<String> {
46        let mut s = Vec::new();
47        self.print_html_fragment(&mut s, allocator)?;
48        Ok(unsafe {
49            // Safe because v was filled from bytes derived from
50            // String/str values and byte string literals (typed in via
51            // Emacs) that were simply concatenated together.
52            String::from_utf8_unchecked(s)
53        })
54    }
55
56    fn to_plain_string(&self, allocator: &HtmlAllocator) -> Result<String> {
57        let mut s = String::new();
58        self.print_plain(&mut s, allocator)?;
59        Ok(s)
60    }
61
62    // fn to_kstring -- don't provide this as it will be confused for
63    // preserialized HTML and that currently only supports elements,
64    // not slices.
65}
66
67impl Print for AId<Node> {
68    fn print_html_fragment(&self, out: &mut impl Write, allocator: &HtmlAllocator) -> Result<()> {
69        let node = allocator
70            .get_node(*self)
71            .expect("id should resolve: {self:?}");
72        node.print_html_fragment(out, allocator)
73    }
74
75    fn print_plain(&self, out: &mut String, allocator: &HtmlAllocator) -> Result<()> {
76        let node = allocator
77            .get_node(*self)
78            .expect("id should resolve: {self:?}");
79        node.print_plain(out, allocator)
80    }
81}
82
83lazy_static! {
84    pub static ref METADB: MetaDb = read_meta_db().unwrap();
85}
86
87impl HtmlAllocatorPool {
88    /// Make a new allocator pool, if `verify` is true, for
89    /// `HtmlAllocator`s with the default HTML5 structure
90    /// checking. Uses a high re-use count; for custom
91    /// `allocator_max_use_count` values, use
92    /// `HtmlAllocatorPool::new_with_metadb` instead.
93    pub fn new(max_allocations: u32, verify: bool, context: Context) -> Self {
94        HtmlAllocatorPool::new_with_metadb(
95            // allocator_max_use_count: u16,
96            60000,
97            max_allocations,
98            if verify { Some(&*METADB) } else { None },
99            context,
100        )
101    }
102}
103
104impl HtmlAllocator {
105    pub fn new(max_allocations: u32, context: Context) -> Self {
106        Self::new_with_metadb(max_allocations, Some(&*METADB), context)
107    }
108}
109
110fn ks<T>(s: T) -> KString
111where
112    KString: MyFrom<T>,
113{
114    KString::myfrom(s)
115}
116
117pub fn att<T, U>(key: T, val: U) -> Option<(KString, KString)>
118where
119    KString: MyFrom<T> + MyFrom<U>,
120{
121    Some((ks(key), ks(val)))
122}
123
124pub fn opt_att<T, U>(key: T, val: Option<U>) -> Option<(KString, KString)>
125where
126    KString: MyFrom<T> + MyFrom<U>,
127{
128    val.map(|val| (ks(key), ks(val)))
129}
130
131impl<T> ToASlice<T> for ASlice<T> {
132    fn to_aslice(self, _allocator: &HtmlAllocator) -> Result<ASlice<T>> {
133        Ok(self)
134    }
135}
136impl<T> ToASlice<T> for &ASlice<T> {
137    fn to_aslice(self, _allocator: &HtmlAllocator) -> Result<ASlice<T>> {
138        Ok(*self)
139    }
140}
141impl<'a, T: AllocatorType> ToASlice<T> for AVec<'a, T> {
142    fn to_aslice(self, _allocator: &HtmlAllocator) -> Result<ASlice<T>> {
143        Ok(self.as_slice())
144    }
145}
146
147impl ToASlice<Node> for AId<Node> {
148    fn to_aslice(self, html: &HtmlAllocator) -> Result<ASlice<Node>> {
149        let mut vec = html.new_vec();
150        vec.push(self)?;
151        Ok(vec.as_slice())
152    }
153}
154
155impl ToASlice<Node> for &[AId<Node>] {
156    fn to_aslice(self, allocator: &HtmlAllocator) -> Result<ASlice<Node>> {
157        let mut vec = allocator.new_vec();
158        for val in self {
159            vec.push(*val)?;
160        }
161        Ok(vec.as_slice())
162    }
163}
164
165impl ToASlice<Node> for Vec<AId<Node>> {
166    fn to_aslice(self, allocator: &HtmlAllocator) -> Result<ASlice<Node>> {
167        self.as_slice().to_aslice(allocator)
168    }
169}
170
171// Take ownership of an array (best syntax)
172impl<const N: usize> ToASlice<Node> for [AId<Node>; N] {
173    fn to_aslice(self, allocator: &HtmlAllocator) -> Result<ASlice<Node>> {
174        // Instantiated for every length, need to keep this short! --
175        // except if we want to avoid the swap, there is no length
176        // independent way to do it, hence have to be fat,
177        // bummer. FUTURE: optimize via unsafe memcpy (if the type
178        // isn't pinned etc.).
179        let mut vec = allocator.new_vec();
180        for val in self {
181            vec.push(val)?;
182        }
183        Ok(vec.as_slice())
184    }
185}
186
187// Take ownership of an array (best syntax, and allows to avoid the
188// need for swap), version for attributes:
189
190// Disabled for now, because with stable Rust we can't resolve the
191// ambiguity with empty arrays without explicit typing
192// impl<'a, const N: usize> ToASlice<(KString, KString)> for [(KString, KString); N] {
193//     fn to_aslice(
194//         self,
195//         allocator: &HtmlAllocator
196//     ) -> Result<ASlice<(KString, KString)>>
197//     {
198//         // Instantiated for every length, need to keep this short! --
199//         // except if we want to avoid the swap, there is no length
200//         // independent way to do it, hence have to be fat,
201//         // bummer. FUTURE: optimize via unsafe memcpy (if the type
202//         // isn't pinned etc.).
203//         let mut vec = allocator.new_vec();
204//         for val in self {
205//             let id_ = allocator.new_attribute(val)?;
206//             vec.push(id_)?;
207//         }
208//         Ok(vec.as_slice())
209//     }
210// }
211
212// Same for values returned by `att` and `opt_att`:
213impl<'a, const N: usize> ToASlice<(KString, KString)> for [Option<(KString, KString)>; N] {
214    fn to_aslice(self, allocator: &HtmlAllocator) -> Result<ASlice<(KString, KString)>> {
215        let mut vec = allocator.new_vec();
216        for opt_val in self {
217            if let Some(val) = opt_val {
218                let id_ = allocator.new_attribute(val)?;
219                vec.push(id_)?;
220            }
221        }
222        Ok(vec.as_slice())
223    }
224}
225
226impl HtmlAllocator {
227    /// `bytes` must represent proper UTF-8,
228    /// e.g. string.as_bytes(). The resulting reference must be
229    /// dropped before calling html_escape again, or there will be a
230    /// panic.
231    pub fn html_escape(&self, bytes: &[u8]) -> RefMut<'_, Vec<u8>> {
232        let mut bufref = self.html_escape_tmp.borrow_mut();
233        let append = |buf: &mut Vec<u8>, bstr: &[u8]| {
234            // XX wanted to use copy_from_slice. But how to reserve
235            // space for it efficiently?
236            buf.extend(bstr.iter());
237        };
238        let buf = &mut *bufref;
239        buf.clear();
240        for b in bytes {
241            match b {
242                b'&' => append(buf, b"&amp;"),
243                b'<' => append(buf, b"&lt;"),
244                b'>' => append(buf, b"&gt;"),
245                b'"' => append(buf, b"&quot;"),
246                b'\'' => append(buf, b"&#39;"),
247                _ => buf.push(*b),
248            }
249        }
250        bufref
251    }
252
253    pub fn print_html_fragment(&self, id_: AId<Node>, out: &mut impl Write) -> Result<()> {
254        let noderef = self.get_node(id_).expect(
255            // (Why does this return a Result even ? Aha, for
256            // invalid dynamic borrow. Should this be changed to panic,
257            // too?)
258            "invalid generation/allocator_id leads to panic, hence this should \
259             always resolve",
260        );
261        match &*noderef {
262            Node::Element(_) => (),
263            Node::String(_) => {
264                // eprintln!("toplevel print_html: Warning: printing of a \
265                //            Node::String")
266            }
267            Node::Preserialized(_) => eprintln!(
268                "toplevel print_html: Warning: printing of a \
269                           Node::Preserialized"
270            ),
271            Node::None => {}
272        }
273        noderef.print_html_fragment(out, self)
274    }
275
276    pub fn print_html_document(&self, id_: AId<Node>, out: &mut impl Write) -> Result<()> {
277        // Add a byte-order mark (BOM) to make sure the output is read
278        // correctly from files, too (e.g. by Safari).
279        out.write_all(BOM.as_bytes())?;
280        out.write_all(DOCTYPE.as_bytes())?;
281        self.print_html_fragment(id_, out)
282    }
283
284    pub fn to_html_string(&self, id: AId<Node>, want_doctype: bool) -> String {
285        let mut v = Vec::new();
286        if want_doctype {
287            self.print_html_document(id, &mut v)
288        } else {
289            self.print_html_fragment(id, &mut v)
290        }
291        .expect("no I/O errors can happen");
292
293        // Safe because v was filled from bytes derived from
294        // String/str values and byte string literals (typed in via
295        // Emacs, but rustc would refuse non-UTF8 sequences anyway?)
296        // that were simply concatenated together.
297        unsafe { String::from_utf8_unchecked(v) }
298    }
299
300    /// Returns an error if id doesn't refer to an Element Node.
301    pub fn preserialize(&self, id: AId<Node>) -> Result<SerHtmlFrag> {
302        let meta = {
303            let noderef = self.get_node(id).expect(
304                // (Why does this return a Result even ? Aha, for
305                // invalid dynamic borrow. Should this be changed to panic,
306                // too?)
307                "invalid generation/allocator_id leads to panic, hence this should \
308                 always resolve",
309            );
310            let n = &*noderef;
311            match n {
312                Node::Element(e) => e.meta,
313                _ => bail!("can only preserialize element nodes"),
314            }
315        };
316        let s = self.to_html_string(id, false);
317        Ok(SerHtmlFrag {
318            meta,
319            string: s.into(),
320        })
321    }
322
323    // 2x partial copy-paste
324
325    pub fn print_plain(&self, id: AId<Node>, out: &mut String) -> Result<()> {
326        let noderef = self.get_node(id).expect(
327            // (Why does this return a Result even ? Aha, for
328            // invalid dynamic borrow. Should this be changed to panic,
329            // too?)
330            "invalid generation/allocator_id leads to panic, hence this should \
331             always resolve",
332        );
333        match &*noderef {
334            Node::Element(_) => (),
335            Node::String(_) => {
336                // eprintln!("toplevel print_plain: Warning: printing of a \
337                //            Node::String")
338            }
339            Node::Preserialized(_) =>
340            // XX eh, that won't work anyway, error later on?
341            {
342                eprintln!(
343                    "toplevel print_plain: Warning: printing of a \
344                           Node::Preserialized"
345                )
346            }
347            Node::None => {}
348        }
349        noderef.print_plain(out, self)
350    }
351
352    /// If you need this to strip html and use the result as AId, be
353    /// sure to use `to_plain_string_aid` instead, as that optimizes
354    /// the case of `id` already representing a string.
355    pub fn to_plain_string(&self, id: AId<Node>) -> Result<KString> {
356        let mut v = String::new();
357        self.print_plain(id, &mut v)?;
358        Ok(KString::from_string(v))
359    }
360
361    /// Like `to_plain_string` but returns a string node (or empty
362    /// node if the input is empty) and optimizes (and silences) the
363    /// case where `id` already represents a string.
364    pub fn to_plain_string_aid(&self, id: AId<Node>) -> Result<AId<Node>> {
365        let noderef = self.get_node(id).expect(
366            // (Why does this return a Result even ? Aha, for
367            // invalid dynamic borrow. Should this be changed to panic,
368            // too?)
369            "invalid generation/allocator_id leads to panic, hence this should \
370             always resolve",
371        );
372        match &*noderef {
373            Node::Element(_) => {
374                let mut v = String::new();
375                self.print_plain(id, &mut v)?;
376                self.string(v)
377            }
378            Node::String(_) => Ok(id),
379            // OK to give an error right away? *Would* error out
380            // anyway on print_plain ('though'), right?
381            Node::Preserialized(_) => bail!("can't currently strip markup from preserialized HTML"),
382            Node::None => Ok(id), // XX is this OK or do we promise to return a string node?
383        }
384    }
385}
386
387include!("../includes/ahtml_elements_include.rs");
388
389impl<T: AllocatorType> Print for ASlice<T> {
390    fn print_html_fragment(&self, out: &mut impl Write, allocator: &HtmlAllocator) -> Result<()> {
391        for node in self.iter_node(allocator) {
392            node.print_html_fragment(out, allocator)?;
393        }
394        Ok(())
395    }
396
397    fn print_plain(&self, out: &mut String, allocator: &HtmlAllocator) -> Result<()> {
398        for node in self.iter_node(allocator) {
399            node.print_plain(out, allocator)?;
400        }
401        Ok(())
402    }
403}
404
405impl Print for (KString, KString) {
406    fn print_html_fragment(&self, out: &mut impl Write, allocator: &HtmlAllocator) -> Result<()> {
407        out.write_all(self.0.as_bytes())?; // XX no escape ever needed?
408        out.write_all(b"=\"")?;
409        out.write_all(&allocator.html_escape(self.1.as_bytes()))?;
410        out.write_all(b"\"")?;
411        Ok(())
412    }
413
414    fn print_plain(&self, _out: &mut String, _allocator: &HtmlAllocator) -> Result<()> {
415        panic!("attributes are never printed in print_plain for Node:s")
416    }
417}
418
419impl Print for Node {
420    fn print_html_fragment(&self, out: &mut impl Write, allocator: &HtmlAllocator) -> Result<()> {
421        Ok(match self {
422            Node::Element(e) => e.print_html_fragment(out, allocator)?,
423            Node::String(s) => out.write_all(&allocator.html_escape(s.as_bytes()))?,
424            Node::Preserialized(ser) => out.write_all(ser.as_str().as_bytes())?,
425            Node::None => (),
426        })
427    }
428    fn print_plain(&self, out: &mut String, allocator: &HtmlAllocator) -> Result<()> {
429        match self {
430            Node::Element(e) => e.print_plain(out, allocator),
431            Node::String(s) => Ok(out.push_str(s.as_str())),
432            Node::Preserialized(_) =>
433            // would require re-parsing
434            {
435                bail!(
436                    "print_plain: cannot (currently) print pre-serialized HTML \
437                       as plain text"
438                )
439            }
440            Node::None => Ok(()),
441        }
442    }
443}
444
445impl Print for Element {
446    fn print_html_fragment(&self, out: &mut impl Write, allocator: &HtmlAllocator) -> Result<()> {
447        let meta = self.meta;
448        // meta.has_global_attributes XX ? only for verification?
449        out.write_all(b"<")?;
450        out.write_all(meta.tag_name.as_bytes())?;
451        for att in self.attr.iter_att(allocator) {
452            out.write_all(b" ")?;
453            att.print_html_fragment(out, allocator)?;
454        }
455        out.write_all(b">")?;
456        self.body.print_html_fragment(out, allocator)?;
457        if meta.has_closing_tag {
458            out.write_all(b"</")?;
459            out.write_all(meta.tag_name.as_bytes())?;
460            out.write_all(b">")?;
461        }
462        Ok(())
463    }
464
465    fn print_plain(&self, out: &mut String, allocator: &HtmlAllocator) -> Result<()> {
466        self.body.print_plain(out, allocator)
467    }
468}
469
470pub trait TryCollectBody {
471    fn try_collect_body(&mut self, html: &HtmlAllocator) -> Result<ASlice<Node>>;
472}
473
474impl<I: Iterator<Item = Result<AId<Node>>>> TryCollectBody for I {
475    fn try_collect_body(&mut self, html: &HtmlAllocator) -> Result<ASlice<Node>> {
476        let mut v = html.new_vec::<Node>();
477        for item in self {
478            v.push(item?)?;
479        }
480        Ok(v.as_slice())
481    }
482}
483
484// fn p_ab(attr: &[(KString, KString)], body: &[Node]) -> Element {
485// Element {
486//     meta: &P_META,
487//     attr: Some(Box::new(*attr)),
488//     body: Some(Box::new(*body)),
489// }
490// }
491
492// trait HtmlCheck {
493//     fn check(&self) -> Result<()>;
494// }
495
496// impl HtmlCheck for Node {
497//     fn check(&self) -> Result<()> {
498//         Ok(())
499//     }
500// }