ahtml/util/
autolink.rs

1use crate::{att, ASlice, HtmlAllocator, Node};
2use anyhow::Result;
3
4use chj_util::nowarn as warn;
5
// Find the first occurrence of `needle` in `s` at or after byte
// offset `pos`, returning the match position as a byte offset into
// `s` (not into the searched tail).
//
// Returns `None` when there is no match, and also when `pos` lies
// beyond the end of `s` or does not fall on a UTF-8 character
// boundary (instead of panicking on the slice operation).
fn find_from(s: &str, pos: usize, needle: &str) -> Option<usize> {
    // `get` yields `None` for out-of-range or non-boundary offsets.
    let tail = s.get(pos..)?;
    // `find` reports offsets relative to `tail`; shift back into `s`.
    tail.find(needle).map(|p| p + pos)
}
9
// Whether `c` is accepted as part of an http/https URL body by the
// auto-detection heuristic: ASCII letters and digits, plus a small
// set of punctuation. This is intentionally narrower than what URLs
// may legally contain.
fn is_url_character(c: char) -> bool {
    matches!(
        c,
        'a'..='z' | 'A'..='Z' | '0'..='9' | ':' | '/' | '.' | '-' | '%' | '='
    )
}
21
// Whether `c` is punctuation that may appear inside a URL but is
// unlikely to be its final character (e.g. a sentence-ending dot).
fn is_character_to_exclude_at_end(c: char) -> bool {
    matches!(c, ':' | '.' | '-' | '%' | '=')
}
27
// Whether a URL may start right after `c`, the character to its left.
// This is a word-boundary check: the start of the text (`None`) and
// any character that is not an ASCII letter or digit qualify.
fn is_character_in_front(c: Option<char>) -> bool {
    c.map_or(true, |ch| !ch.is_ascii_alphanumeric())
}
37
38/// Convert `text` to an HTML node list usable as HTML Element body,
39/// creating links around "http" and "https" URLs contained in `text`.
40pub fn autolink(html: &HtmlAllocator, text: &str) -> Result<ASlice<Node>> {
41    warn!("autolink for {text:?}");
42    let mut nodes = html.new_vec();
43    let mut pos_done = 0;
44    let mut pos_remainder = 0;
45    while let Some(pos) = find_from(text, pos_remainder, "http") {
46        warn!("found pos={pos}");
47        let pos_rest = pos + 4; // after the 'http'
48
49        let mut backwardsiter = (&text[0..pos]).chars().rev();
50        if !is_character_in_front(backwardsiter.next()) {
51            warn!("nope 0");
52            pos_remainder = pos_rest;
53            continue;
54        }
55
56        let mut restiter = (&text[pos_rest..]).chars();
57        let c0 = restiter.next();
58        let c1 = restiter.next();
59        let skip_len = match c0 {
60            Some('s') => match c1 {
61                Some(':') => 2,
62                _ => {
63                    warn!("nope 1");
64                    pos_remainder = pos_rest;
65                    continue;
66                }
67            },
68            Some(':') => 1,
69            _ => {
70                warn!("nope 2");
71                pos_remainder = pos_rest;
72                continue;
73            }
74        };
75        let pos_rest = pos_rest + skip_len;
76        if !(&text[pos_rest..]).starts_with("//") {
77            warn!("nope: no // in {:?}", &text[pos_rest..]);
78            pos_remainder = pos_rest;
79            continue;
80        }
81        let pos_rest = pos_rest + 2;
82        let (one_before_end, end); // in text
83        'find_end: loop {
84            // loop bc labels on blocks are unstable
85            let mut last_i = 0;
86            for (i, c) in (&text[pos_rest..]).char_indices() {
87                if !is_url_character(c) {
88                    one_before_end = pos_rest + last_i;
89                    end = pos_rest + i;
90                    break 'find_end;
91                }
92                last_i = i;
93            }
94            one_before_end = pos_rest + last_i;
95            end = text.len();
96            break;
97        }
98
99        if one_before_end == end {
100            warn!("nope: nothing after //");
101            pos_remainder = pos_rest;
102            continue;
103        }
104
105        let char_before_end = text[one_before_end..]
106            .chars()
107            .next()
108            .expect("char is there because we maintained one_before_end to point there"); // XXX ah, but 0 ?
109        let real_end = if is_character_to_exclude_at_end(char_before_end) {
110            one_before_end
111        } else {
112            end
113        };
114        let url = &text[pos..real_end];
115
116        if pos - pos_done > 0 {
117            nodes.push(html.text(&text[pos_done..pos])?)?;
118        }
119        let link = html.a([att("href", url)], [html.text(url)?])?;
120        nodes.push(link)?;
121        warn!("pushed node: {}", html.to_html_string(link, false));
122
123        pos_done = real_end;
124        pos_remainder = real_end;
125    }
126
127    if pos_done < text.len() {
128        nodes.push(html.text(&text[pos_done..])?)?;
129    }
130
131    Ok(nodes.as_slice())
132}
133
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Print;

    // `find_from` must report offsets relative to the whole string and
    // honour the starting position.
    #[test]
    fn t_find_from() {
        let cases: [(&str, usize, &str, Option<usize>); 7] = [
            ("hello world", 0, "World", None),
            ("hello world", 0, "world", Some(6)),
            ("hello world", 5, "world", Some(6)),
            ("hello world", 6, "world", Some(6)),
            ("hello world", 7, "world", None),
            ("hello world in many worlds", 3, "world", Some(6)),
            ("hello world in many worlds", 7, "world", Some(20)),
        ];
        for (haystack, start, needle, expected) in cases {
            assert_eq!(find_from(haystack, start, needle), expected);
        }
    }

    // Run `autolink` on `s` with a fresh allocator and serialize the
    // resulting nodes as an HTML fragment.
    fn t(s: &str) -> String {
        let allocator = HtmlAllocator::new(1000, std::sync::Arc::new(""));
        let linked = autolink(&allocator, s).unwrap();
        linked.to_html_fragment_string(&allocator).unwrap()
    }

    #[test]
    fn t_() {
        // Inputs that contain no complete URL are passed through as-is.
        let unchanged = [
            "http:// ",
            "http://",
            "",
            "foo",
            "http",
            "https",
            "http:",
            "http:/",
        ];
        for input in unchanged {
            assert_eq!(t(input), input);
        }

        // (input, expected HTML fragment)
        let cases = [
            ("http://foo", "<a href=\"http://foo\">http://foo</a>"),
            (
                "There's http://foo.com there.",
                "There&#39;s <a href=\"http://foo.com\">http://foo.com</a> there.",
            ),
            (
                "There's http://foo.com. Yes.",
                "There&#39;s <a href=\"http://foo.com\">http://foo.com</a>. Yes.",
            ),
            (
                "http://foo.com.",
                "<a href=\"http://foo.com\">http://foo.com</a>.",
            ),
            // No word boundary in front of "http": not a link.
            ("hmhttp://foo.com.", "hmhttp://foo.com."),
        ];
        for (input, expected) in cases {
            assert_eq!(t(input), expected);
        }
    }
}