Merge branch 'master' into dynamic-LUT

bidi-plume
Marcus Klaas de Vries 4 years ago
commit 7277fb5171

@ -0,0 +1,37 @@
extern crate pulldown_cmark;
use pulldown_cmark::{html, BrokenLink, Options, Parser};
fn main() {
let input: &str = "Hello world, check out [my website][].";
println!("Parsing the following markdown string:\n{}", input);
// Set up a callback that sets the URL and title when it encounters
// a reference to our home page.
let callback = &mut |broken_link: BrokenLink| {
if broken_link.reference == "my website" {
println!(
"Replacing the markdown `{}` of type {:?} with a working link",
&input[broken_link.span], broken_link.link_type,
);
Some(("http://example.com".into(), "my example website".into()))
} else {
None
}
};
// Create a parser with our callback function for broken links.
let parser = Parser::new_with_broken_link_callback(input, Options::empty(), Some(callback));
// Write to String buffer.
let mut html_output: String = String::with_capacity(input.len() * 3 / 2);
html::push_html(&mut html_output, parser);
// Check that the output is what we expected.
let expected_html: &str =
"<p>Hello world, check out <a href=\"http://example.com\" title=\"my example website\">my website</a>.</p>\n";
assert_eq!(expected_html, &html_output);
// Write result to stdout.
println!("\nHTML output:\n{}", &html_output);
}
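Note on the API change shown later in this commit: the callback type moves from `Option<&'a dyn Fn(&str, &str) -> Option<(String, String)>>` to the new `BrokenLinkCallback` alias, `Option<&'a mut dyn FnMut(BrokenLink) -> Option<(CowStr<'a>, CowStr<'a>)>>`. Below is a minimal migration sketch under that assumption; the `resolve` helper is hypothetical and only stands in for an existing old-style resolver.

use pulldown_cmark::{html, BrokenLink, CowStr, Options, Parser};

// Hypothetical old-style resolver taking (normalized, raw) reference strings.
fn resolve(_normalized: &str, raw: &str) -> Option<(String, String)> {
    Some((format!("https://example.com/{}", raw), raw.to_owned()))
}

fn render(input: &str) -> String {
    // Adapt the resolver to the new FnMut(BrokenLink) callback shape.
    let mut callback = |link: BrokenLink| {
        resolve(link.reference, link.reference)
            .map(|(url, title)| (CowStr::from(url), CowStr::from(title)))
    };
    let parser =
        Parser::new_with_broken_link_callback(input, Options::empty(), Some(&mut callback));
    let mut out = String::new();
    html::push_html(&mut out, parser);
    out
}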

@ -388,9 +388,9 @@ ISSUE #295
<p>[foo]:</p>
````````````````````````````````
ISSUE #298 (not yet fixed)
ISSUE #298
```````````````````````````````` DISABLED example
```````````````````````````````` example
> [foo
> bar]: /url
>
@ -697,7 +697,7 @@ ISSUE 398
ISSUE 399
```````````````````````````````` DISABLED example
```````````````````````````````` example
> Note: Though you should not rely on this, all pointers to <abbr
> title="Dynamically Sized Types">DSTs</abbr> are currently twice the size of
> the size of `usize` and have the same alignment.
@ -798,3 +798,19 @@ ISSUE 437
.
<p>&lt;foo</p>
````````````````````````````````
Inline HTML stress test
```````````````````````````````` example
> > a <a href
> > ="yo
> > lo">
.
<blockquote>
<blockquote>
<p>a <a href
="yo
lo"></p>
</blockquote>
</blockquote>
````````````````````````````````

@ -18,13 +18,13 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//! Utility functions for HTML escaping
//! Utility functions for HTML escaping. Only useful when building your own
//! HTML renderer.
use std::io;
use std::fmt::{Arguments, Write as FmtWrite};
use std::io::{self, ErrorKind, Write};
use std::str::from_utf8;
use crate::html::StrWrite;
#[rustfmt::skip]
static HREF_SAFE: [u8; 128] = [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -41,7 +41,66 @@ static HEX_CHARS: &[u8] = b"0123456789ABCDEF";
static AMP_ESCAPE: &str = "&amp;";
static SLASH_ESCAPE: &str = "&#x27;";
pub(crate) fn escape_href<W>(mut w: W, s: &str) -> io::Result<()>
/// This wrapper exists because we can't have both a blanket implementation
/// for all types implementing `Write` and types of the form `&mut W` where
/// `W: StrWrite`. Since we need the latter a lot, we choose to wrap
/// `Write` types.
pub struct WriteWrapper<W>(pub W);
/// Trait that allows writing string slices. This is basically an extension
/// of `std::io::Write` in order to include `String`.
pub trait StrWrite {
fn write_str(&mut self, s: &str) -> io::Result<()>;
fn write_fmt(&mut self, args: Arguments) -> io::Result<()>;
}
impl<W> StrWrite for WriteWrapper<W>
where
W: Write,
{
#[inline]
fn write_str(&mut self, s: &str) -> io::Result<()> {
self.0.write_all(s.as_bytes())
}
#[inline]
fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
self.0.write_fmt(args)
}
}
impl<'w> StrWrite for String {
#[inline]
fn write_str(&mut self, s: &str) -> io::Result<()> {
self.push_str(s);
Ok(())
}
#[inline]
fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
// FIXME: translate fmt error to io error?
FmtWrite::write_fmt(self, args).map_err(|_| ErrorKind::Other.into())
}
}
impl<W> StrWrite for &'_ mut W
where
W: StrWrite,
{
#[inline]
fn write_str(&mut self, s: &str) -> io::Result<()> {
(**self).write_str(s)
}
#[inline]
fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
(**self).write_fmt(args)
}
}
/// Writes an href to the buffer, escaping href unsafe bytes.
pub fn escape_href<W>(mut w: W, s: &str) -> io::Result<()>
where
W: StrWrite,
{
@ -93,7 +152,7 @@ static HTML_ESCAPES: [&'static str; 5] = ["", "&quot;", "&amp;", "&lt;", "&gt;"]
/// Writes the given string to the Write sink, replacing special HTML bytes
/// (<, >, &, ") by escape sequences.
pub(crate) fn escape_html<W: StrWrite>(w: W, s: &str) -> io::Result<()> {
pub fn escape_html<W: StrWrite>(w: W, s: &str) -> io::Result<()> {
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
{
simd::escape_html(w, s)
@ -131,7 +190,7 @@ fn escape_html_scalar<W: StrWrite>(mut w: W, s: &str) -> io::Result<()> {
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
mod simd {
use crate::html::StrWrite;
use super::StrWrite;
use std::arch::x86_64::*;
use std::io;
use std::mem::size_of;
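Since `escape` becomes a public module (see the lib.rs hunk later in this commit) and `String` implements the new `StrWrite` trait, custom renderers can escape straight into a string buffer. A minimal sketch, assuming the module is reachable as `pulldown_cmark::escape`:

use pulldown_cmark::escape::{escape_href, escape_html};

fn main() -> std::io::Result<()> {
    // `String` implements `StrWrite`, so it can serve as the sink directly.
    let mut out = String::new();
    out.push_str("<a href=\"");
    escape_href(&mut out, "/search?q=a&b")?;
    out.push_str("\">");
    escape_html(&mut out, "a < b & \"c\"")?;
    out.push_str("</a>");
    println!("{}", out);
    Ok(())
}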

@ -21,10 +21,9 @@
//! HTML renderer that takes an iterator of events as input.
use std::collections::HashMap;
use std::fmt::{Arguments, Write as FmtWrite};
use std::io::{self, ErrorKind, Write};
use std::io::{self, Write};
use crate::escape::{escape_href, escape_html};
use crate::escape::{escape_href, escape_html, StrWrite, WriteWrapper};
use crate::parse::Event::*;
use crate::parse::{Alignment, CodeBlockKind, Event, LinkType, Tag};
use crate::strings::CowStr;
@ -34,64 +33,6 @@ enum TableState {
Body,
}
/// This wrapper exists because we can't have both a blanket implementation
/// for all types implementing `Write` and types of the form `&mut W` where
/// `W: StrWrite`. Since we need the latter a lot, we choose to wrap
/// `Write` types.
struct WriteWrapper<W>(W);
/// Trait that allows writing string slices. This is basically an extension
/// of `std::io::Write` in order to include `String`.
pub(crate) trait StrWrite {
fn write_str(&mut self, s: &str) -> io::Result<()>;
fn write_fmt(&mut self, args: Arguments) -> io::Result<()>;
}
impl<W> StrWrite for WriteWrapper<W>
where
W: Write,
{
#[inline]
fn write_str(&mut self, s: &str) -> io::Result<()> {
self.0.write_all(s.as_bytes())
}
#[inline]
fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
self.0.write_fmt(args)
}
}
impl<'w> StrWrite for String {
#[inline]
fn write_str(&mut self, s: &str) -> io::Result<()> {
self.push_str(s);
Ok(())
}
#[inline]
fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
// FIXME: translate fmt error to io error?
FmtWrite::write_fmt(self, args).map_err(|_| ErrorKind::Other.into())
}
}
impl<W> StrWrite for &'_ mut W
where
W: StrWrite,
{
#[inline]
fn write_str(&mut self, s: &str) -> io::Result<()> {
(**self).write_str(s)
}
#[inline]
fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
(**self).write_fmt(args)
}
}
struct HtmlWriter<'a, I, W> {
/// Iterator supplying events.
iter: I,

@ -59,7 +59,7 @@ extern crate bitflags;
extern crate unicase;
mod entities;
mod escape;
pub mod escape;
mod linklabel;
mod parse;
mod puncttable;
@ -71,6 +71,6 @@ mod tree;
mod simd;
pub use crate::parse::{
Alignment, CodeBlockKind, Event, LinkType, OffsetIter, Options, Parser, Tag,
Alignment, BrokenLink, CodeBlockKind, Event, LinkType, OffsetIter, Options, Parser, Tag,
};
pub use crate::strings::{CowStr, InlineStr};

@ -215,7 +215,8 @@ enum ItemBody {
MaybeCode(usize, bool), // number of backticks, preceded by backslash
MaybeHtml,
MaybeLinkOpen,
MaybeLinkClose,
// bool indicates whether or not the preceding section could be a reference
MaybeLinkClose(bool),
MaybeImage,
// These are inline items after resolution.
@ -233,6 +234,7 @@ enum ItemBody {
FencedCodeBlock(CowIndex),
IndentCodeBlock,
Html,
OwnedHtml(CowIndex),
BlockQuote,
List(bool, u8, u64), // is_tight, list character, list start index
ListItem(usize), // indent level
@ -258,7 +260,7 @@ impl<'a> ItemBody {
| ItemBody::MaybeHtml
| ItemBody::MaybeCode(..)
| ItemBody::MaybeLinkOpen
| ItemBody::MaybeLinkClose
| ItemBody::MaybeLinkClose(..)
| ItemBody::MaybeImage => true,
_ => false,
}
@ -282,6 +284,12 @@ enum TableParseMode {
Disabled,
}
pub struct BrokenLink<'a> {
pub span: std::ops::Range<usize>,
pub link_type: LinkType,
pub reference: &'a str,
}
/// State for the first parsing pass.
///
/// The first pass resolves all block structure, generating an AST. Within a block, items
@ -443,7 +451,7 @@ impl<'a, 'b> FirstPass<'a, 'b> {
}
// Detect type 7
if let Some(_html_bytes) = scan_html_type_7(&bytes[(ix + 1)..]) {
if let Some(_html_bytes) = scan_html_type_7(&bytes[ix..]) {
return self.parse_html_block_type_6_or_7(ix, remaining_space);
}
}
@ -851,7 +859,7 @@ impl<'a, 'b> FirstPass<'a, 'b> {
self.tree.append(Item {
start: ix,
end: ix + 1,
body: ItemBody::MaybeLinkClose,
body: ItemBody::MaybeLinkClose(true),
});
begin_text = ix + 1;
LoopInstruction::ContinueAndSkip(0)
@ -1743,8 +1751,8 @@ impl InlineStack {
#[derive(Debug, Clone)]
enum RefScan<'a> {
// label, next node index, source ix of label end
LinkLabel(CowStr<'a>, Option<TreeIndex>, usize),
// label, source ix of label end
LinkLabel(CowStr<'a>, usize),
// contains next node index
Collapsed(Option<TreeIndex>),
Failed,
@ -1772,6 +1780,7 @@ fn scan_nodes_to_ix(
fn scan_link_label<'text, 'tree>(
tree: &'tree Tree<Item>,
text: &'text str,
allow_footnote_refs: bool,
) -> Option<(usize, ReferenceLabel<'text>)> {
let bytes = &text.as_bytes();
if bytes.len() < 2 || bytes[0] != b'[' {
@ -1782,7 +1791,7 @@ fn scan_link_label<'text, 'tree>(
let _ = scan_containers(tree, &mut line_start);
Some(line_start.bytes_scanned())
};
let pair = if b'^' == bytes[1] {
let pair = if allow_footnote_refs && b'^' == bytes[1] {
let (byte_index, cow) = scan_link_label_rest(&text[2..], &linebreak_handler)?;
(byte_index + 2, ReferenceLabel::Footnote(cow))
} else {
@ -1796,6 +1805,7 @@ fn scan_reference<'a, 'b>(
tree: &'a Tree<Item>,
text: &'b str,
cur: Option<TreeIndex>,
allow_footnote_refs: bool,
) -> RefScan<'b> {
let cur_ix = match cur {
None => return RefScan::Failed,
@ -1807,9 +1817,10 @@ fn scan_reference<'a, 'b>(
if tail.starts_with(b"[]") {
let closing_node = tree[cur_ix].next.unwrap();
RefScan::Collapsed(tree[closing_node].next)
} else if let Some((ix, ReferenceLabel::Link(label))) = scan_link_label(tree, &text[start..]) {
let next_node = scan_nodes_to_ix(tree, cur, start + ix);
RefScan::LinkLabel(label, next_node, start + ix)
} else if let Some((ix, ReferenceLabel::Link(label))) =
scan_link_label(tree, &text[start..], allow_footnote_refs)
{
RefScan::LinkLabel(label, start + ix)
} else {
RefScan::Failed
}
@ -2033,13 +2044,16 @@ pub(crate) fn create_lut(options: &Options) -> LookupTable {
}
}
pub type BrokenLinkCallback<'a> =
Option<&'a mut dyn FnMut(BrokenLink) -> Option<(CowStr<'a>, CowStr<'a>)>>;
/// Markdown event iterator.
#[derive(Clone)]
pub struct Parser<'a> {
text: &'a str,
options: Options,
tree: Tree<Item>,
allocs: Allocations<'a>,
broken_link_callback: Option<&'a dyn Fn(&str, &str) -> Option<(String, String)>>,
broken_link_callback: BrokenLinkCallback<'a>,
html_scan_guard: HtmlScanGuard,
// used by inline passes. store them here for reuse
@ -2066,7 +2080,7 @@ impl<'a> Parser<'a> {
pub fn new_with_broken_link_callback(
text: &'a str,
options: Options,
broken_link_callback: Option<&'a dyn Fn(&str, &str) -> Option<(String, String)>>,
broken_link_callback: BrokenLinkCallback<'a>,
) -> Parser<'a> {
let lut = create_lut(&options);
let first_pass = FirstPass::new(text, options, &lut);
@ -2077,6 +2091,7 @@ impl<'a> Parser<'a> {
let html_scan_guard = Default::default();
Parser {
text,
options,
tree,
allocs,
broken_link_callback,
@ -2139,17 +2154,23 @@ impl<'a> Parser<'a> {
}
continue;
} else {
let inline_html = if let Some(next_ix) = next {
let inline_html = next.and_then(|next_ix| {
self.scan_inline_html(
block_text.as_bytes(),
self.tree[next_ix].item.start,
)
} else {
None
};
if let Some(ix) = inline_html {
});
if let Some((span, ix)) = inline_html {
let node = scan_nodes_to_ix(&self.tree, next, ix);
self.tree[cur_ix].item.body = ItemBody::Html;
self.tree[cur_ix].item.body = if !span.is_empty() {
let converted_string =
String::from_utf8(span).expect("invalid utf8");
ItemBody::OwnedHtml(
self.allocs.allocate_cow(converted_string.into()),
)
} else {
ItemBody::Html
};
self.tree[cur_ix].item.end = ix;
self.tree[cur_ix].next = node;
prev = cur;
@ -2223,7 +2244,7 @@ impl<'a> Parser<'a> {
ty: LinkStackTy::Image,
});
}
ItemBody::MaybeLinkClose => {
ItemBody::MaybeLinkClose(could_be_ref) => {
self.tree[cur_ix].item.body = ItemBody::Text;
if let Some(tos) = self.link_stack.pop() {
if tos.ty == LinkStackTy::Disabled {
@ -2259,23 +2280,53 @@ impl<'a> Parser<'a> {
} else {
// ok, so it's not an inline link. maybe it is a reference
// to a defined link?
let scan_result = scan_reference(&self.tree, block_text, next);
let scan_result = scan_reference(
&self.tree,
block_text,
next,
self.options.contains(Options::ENABLE_FOOTNOTES),
);
let (node_after_link, link_type) = match scan_result {
// [label][reference]
RefScan::LinkLabel(_, next_node, _) => {
RefScan::LinkLabel(_, end_ix) => {
// Toggle reference viability of the last closing bracket,
// so that we can skip it on future iterations in case
// it fails in this one. In particular, we won't call
// the broken link callback twice on one reference.
let reference_close_node =
scan_nodes_to_ix(&self.tree, next, end_ix - 1).unwrap();
self.tree[reference_close_node].item.body =
ItemBody::MaybeLinkClose(false);
let next_node = self.tree[reference_close_node].next;
(next_node, LinkType::Reference)
}
// []
RefScan::Collapsed(next_node) => (next_node, LinkType::Collapsed),
// [reference][]
RefScan::Collapsed(next_node) => {
// This reference has already been tried, and it's not
// valid. Skip it.
if !could_be_ref {
continue;
}
(next_node, LinkType::Collapsed)
}
// [shortcut]
//
// [shortcut]: /blah
RefScan::Failed => (next, LinkType::Shortcut),
RefScan::Failed => {
if !could_be_ref {
continue;
}
(next, LinkType::Shortcut)
}
};
// FIXME: references and labels are mixed in the naming of variables
// below. Disambiguate!
// (label, source_ix end)
let label: Option<(ReferenceLabel<'a>, usize)> = match scan_result {
RefScan::LinkLabel(l, _, end_ix) => {
RefScan::LinkLabel(l, end_ix) => {
Some((ReferenceLabel::Link(l), end_ix))
}
RefScan::Collapsed(..) | RefScan::Failed => {
@ -2284,6 +2335,7 @@ impl<'a> Parser<'a> {
scan_link_label(
&self.tree,
&self.text[label_start..self.tree[cur_ix].item.end],
self.options.contains(Options::ENABLE_FOOTNOTES),
)
.map(|(ix, label)| (label, label_start + ix))
}
@ -2316,15 +2368,21 @@ impl<'a> Parser<'a> {
(link_type, url, title)
})
.or_else(|| {
self.broken_link_callback
.and_then(|callback| {
// looked for matching definition, but didn't find it. try to fix
// link with callback, if it is defined
callback(link_label.as_ref(), link_label.as_ref())
})
.map(|(url, title)| {
(link_type.to_unknown(), url.into(), title.into())
})
match self.broken_link_callback.as_mut() {
Some(callback) => {
// Construct a BrokenLink struct, which will be passed to the callback
let broken_link = BrokenLink {
span: (self.tree[tos.node].item.start)..end,
link_type: link_type,
reference: link_label.as_ref(),
};
callback(broken_link).map(|(url, title)| {
(link_type.to_unknown(), url, title)
})
}
None => None,
}
});
if let Some((def_link_type, url, title)) = type_url_title {
@ -2685,23 +2743,32 @@ impl<'a> Parser<'a> {
}
}
/// Returns the next byte offset on success.
fn scan_inline_html(&mut self, bytes: &[u8], ix: usize) -> Option<usize> {
/// On success, returns a buffer containing the inline html and byte offset.
/// When no bytes were skipped, the buffer will be empty and the html can be
/// represented as a subslice of the input string.
fn scan_inline_html(&mut self, bytes: &[u8], ix: usize) -> Option<(Vec<u8>, usize)> {
let c = *bytes.get(ix)?;
if c == b'!' {
scan_inline_html_comment(bytes, ix + 1, &mut self.html_scan_guard)
Some((
vec![],
scan_inline_html_comment(bytes, ix + 1, &mut self.html_scan_guard)?,
))
} else if c == b'?' {
scan_inline_html_processing(bytes, ix + 1, &mut self.html_scan_guard)
Some((
vec![],
scan_inline_html_processing(bytes, ix + 1, &mut self.html_scan_guard)?,
))
} else {
let i = scan_html_block_inner(
&bytes[ix..],
let (span, i) = scan_html_block_inner(
// Subtract 1 to include the < character
&bytes[(ix - 1)..],
Some(&|_bytes| {
let mut line_start = LineStart::new(bytes);
let _ = scan_containers(&self.tree, &mut line_start);
line_start.bytes_scanned()
}),
)?;
Some(i + ix)
Some((span, i + ix - 1))
}
}
@ -2873,6 +2940,7 @@ fn item_to_event<'a>(item: Item, text: &'a str, allocs: &Allocations<'a>) -> Eve
ItemBody::SynthesizeText(cow_ix) => return Event::Text(allocs[cow_ix].clone()),
ItemBody::SynthesizeChar(c) => return Event::Text(c.into()),
ItemBody::Html => return Event::Html(text[item.start..item.end].into()),
ItemBody::OwnedHtml(cow_ix) => return Event::Html(allocs[cow_ix].clone()),
ItemBody::SoftBreak => return Event::SoftBreak,
ItemBody::HardBreak => return Event::HardBreak,
ItemBody::FootnoteReference(cow_ix) => {
@ -3132,7 +3200,7 @@ mod test {
#[test]
fn footnote_offsets() {
let range = Parser::new("Testing this[^1] out.\n\n[^1]: Footnote.")
let range = parser_with_extensions("Testing this[^1] out.\n\n[^1]: Footnote.")
.into_offset_iter()
.filter_map(|(ev, range)| match ev {
Event::FootnoteReference(..) => Some(range),
@ -3186,6 +3254,16 @@ mod test {
assert_eq!(expected, buf);
}
#[test]
fn no_footnote_refs_without_option() {
let test_str = "a [^a]\n\n[^a]: yolo";
let expected = "<p>a <a href=\"yolo\">^a</a></p>\n";
let mut buf = String::new();
crate::html::push_html(&mut buf, Parser::new(test_str));
assert_eq!(expected, buf);
}
#[test]
fn ref_def_at_eof() {
let test_str = "[test]:\\";
@ -3216,18 +3294,37 @@ mod test {
assert_eq!(expected, buf);
}
#[test]
fn broken_links_called_only_once() {
for &(markdown, expected) in &[
("See also [`g()`][crate::g].", 1),
("See also [`g()`][crate::g][].", 1),
("[brokenlink1] some other node [brokenlink2]", 2),
] {
let mut times_called = 0;
let callback = &mut |_broken_link: BrokenLink| {
times_called += 1;
None
};
let parser =
Parser::new_with_broken_link_callback(markdown, Options::empty(), Some(callback));
for _ in parser {}
assert_eq!(times_called, expected);
}
}
#[test]
fn simple_broken_link_callback() {
let test_str = "This is a link w/o def: [hello][world]";
let parser = Parser::new_with_broken_link_callback(
test_str,
Options::empty(),
Some(&|norm, raw| {
assert_eq!("world", raw);
assert_eq!("world", norm);
Some(("YOLO".to_owned(), "SWAG".to_owned()))
}),
);
let mut callback = |broken_link: BrokenLink| {
assert_eq!("world", broken_link.reference);
assert_eq!(&test_str[broken_link.span], "[hello][world]");
let url = "YOLO".into();
let title = "SWAG".to_owned().into();
Some((url, title))
};
let parser =
Parser::new_with_broken_link_callback(test_str, Options::empty(), Some(&mut callback));
let mut link_tag_count = 0;
for (typ, url, title) in parser.filter_map(|event| match event {
Event::Start(tag) | Event::End(tag) => match tag {

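The `allow_footnote_refs` flag threaded through `scan_link_label` and `scan_reference` above means `[^...]` labels are only treated as footnotes when `Options::ENABLE_FOOTNOTES` is set (see the `no_footnote_refs_without_option` test). A small sketch of the observable difference, assuming the crate's `Parser::new_ext` constructor for the options-enabled case:

use pulldown_cmark::{html, Options, Parser};

fn main() {
    let input = "a [^a]\n\n[^a]: yolo";

    // Without ENABLE_FOOTNOTES, `[^a]` is treated as an ordinary link reference.
    let mut plain = String::new();
    html::push_html(&mut plain, Parser::new(input));

    // With ENABLE_FOOTNOTES, it is parsed as a footnote reference instead.
    let mut with_footnotes = String::new();
    html::push_html(
        &mut with_footnotes,
        Parser::new_ext(input, Options::ENABLE_FOOTNOTES),
    );

    assert_ne!(plain, with_footnotes);
    println!("{}\n{}", plain, with_footnotes);
}
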
@ -855,19 +855,24 @@ fn scan_attribute_name(data: &[u8]) -> Option<usize> {
}
}
/// Returns byte scanned (TODO: should it return new offset?)
// TODO: properly use the newline handler here
fn scan_attribute(data: &[u8], newline_handler: Option<&dyn Fn(&[u8]) -> usize>) -> Option<usize> {
let allow_newline = newline_handler.is_some();
let whitespace_scanner =
|c| is_ascii_whitespace(c) && (allow_newline || c != b'\n' && c != b'\r');
let mut ix = scan_attribute_name(data)?;
let n_whitespace = scan_while(&data[ix..], whitespace_scanner);
/// Returns the index immediately following the attribute on success.
/// The argument `buffer_ix` refers to the index into `data` from which we
/// should copy into `buffer` when we find bytes to skip.
fn scan_attribute(
data: &[u8],
mut ix: usize,
newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
buffer: &mut Vec<u8>,
buffer_ix: &mut usize,
) -> Option<usize> {
ix += scan_attribute_name(&data[ix..])?;
let n_whitespace =
scan_whitespace_with_newline_handler(data, ix, newline_handler, buffer, buffer_ix)? - ix;
ix += n_whitespace;
if scan_ch(&data[ix..], b'=') == 1 {
ix += 1;
ix += scan_while(&data[ix..], whitespace_scanner);
ix += scan_attribute_value(&data[ix..], newline_handler)?;
ix = scan_whitespace_with_newline_handler(data, ix, newline_handler, buffer, buffer_ix)?;
ix = scan_attribute_value(&data, ix, newline_handler, buffer, buffer_ix)?;
} else if n_whitespace > 0 {
// Leave whitespace for next attribute.
ix -= 1;
@ -875,12 +880,48 @@ fn scan_attribute(data: &[u8], newline_handler: Option<&dyn Fn(&[u8]) -> usize>)
Some(ix)
}
/// Scans whitespace and possibly newlines according to the
/// behavior defined by the newline handler. When bytes are skipped,
/// all preceding non-skipped bytes are pushed to the buffer.
fn scan_whitespace_with_newline_handler(
data: &[u8],
mut i: usize,
newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
buffer: &mut Vec<u8>,
buffer_ix: &mut usize,
) -> Option<usize> {
while i < data.len() {
if !is_ascii_whitespace(data[i]) {
return Some(i);
}
if let Some(eol_bytes) = scan_eol(&data[i..]) {
let handler = newline_handler?;
i += eol_bytes;
let skipped_bytes = handler(&data[i..]);
if skipped_bytes > 0 {
buffer.extend(&data[*buffer_ix..i]);
*buffer_ix = i + skipped_bytes;
}
i += skipped_bytes;
} else {
i += 1;
}
}
Some(i)
}
/// Returns the index immediately following the attribute value on success.
fn scan_attribute_value(
data: &[u8],
mut i: usize,
newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
buffer: &mut Vec<u8>,
buffer_ix: &mut usize,
) -> Option<usize> {
let mut i = 0;
match *data.get(0)? {
match *data.get(i)? {
b @ b'"' | b @ b'\'' => {
i += 1;
while i < data.len() {
@ -890,7 +931,13 @@ fn scan_attribute_value(
if let Some(eol_bytes) = scan_eol(&data[i..]) {
let handler = newline_handler?;
i += eol_bytes;
i += handler(&data[i..]);
let skipped_bytes = handler(&data[i..]);
if skipped_bytes > 0 {
buffer.extend(&data[*buffer_ix..i]);
*buffer_ix = i + skipped_bytes;
}
i += skipped_bytes;
} else {
i += 1;
}
@ -905,6 +952,7 @@ fn scan_attribute_value(
i += scan_attr_value_chars(&data[i..]);
}
}
Some(i)
}
@ -975,30 +1023,36 @@ pub(crate) fn is_html_tag(tag: &[u8]) -> bool {
.is_ok()
}
/// Assumes that `data` is preceded by `<`.
/// Assumes that `data` starts with `<`.
/// Returns the index into data directly after the html tag on success.
pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
// Block type html does not allow for newlines, so we
// do not pass a newline handler.
let i = scan_html_block_inner(data, None)?;
let (_span, i) = scan_html_block_inner(data, None)?;
scan_blank_line(&data[i..])?;
Some(i)
}
// FIXME: maybe this should receive a whitespace handler
// instead of a newline handler.
// With signature `&dyn Fn(&[u8]) -> Option<usize>`.
// We currently need to implement whitespace handling in all of
// this function's dependencies as well.
/// Assumes that `data` starts with `<`.
/// Returns the number of bytes scanned and the html in case of
/// success.
/// When some bytes were skipped because the html was split over
/// multiple leaves (e.g. over multiple lines in a blockquote),
/// the html is returned as a vector of bytes.
/// If no bytes were skipped, the buffer will be empty.
pub(crate) fn scan_html_block_inner(
data: &[u8],
newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
) -> Option<usize> {
let close_tag_bytes = scan_ch(data, b'/');
let l = scan_while(&data[close_tag_bytes..], is_ascii_alpha);
) -> Option<(Vec<u8>, usize)> {
let mut buffer = Vec::new();
let mut last_buf_index = 0;
let close_tag_bytes = scan_ch(&data[1..], b'/');
let l = scan_while(&data[(1 + close_tag_bytes)..], is_ascii_alpha);
if l == 0 {
return None;
}
let mut i = close_tag_bytes + l;
let mut i = 1 + close_tag_bytes + l;
i += scan_while(&data[i..], is_ascii_letterdigitdash);
if close_tag_bytes == 0 {
@ -1010,11 +1064,14 @@ pub(crate) fn scan_html_block_inner(
if eol_bytes == 0 {
return None;
}
if let Some(handler) = newline_handler {
i += eol_bytes;
i += handler(&data[i..]);
} else {
return None;
let handler = newline_handler?;
i += eol_bytes;
let skipped_bytes = handler(&data[i..]);
if skipped_bytes > 0 {
buffer.extend(&data[last_buf_index..i]);
i += skipped_bytes;
last_buf_index = i;
}
} else {
break;
@ -1027,7 +1084,7 @@ pub(crate) fn scan_html_block_inner(
// No whitespace, which is mandatory.
return None;
}
i += scan_attribute(&data[i..], newline_handler)?;
i = scan_attribute(&data, i, newline_handler, &mut buffer, &mut last_buf_index)?;
}
}
@ -1040,7 +1097,11 @@ pub(crate) fn scan_html_block_inner(
if scan_ch(&data[i..], b'>') == 0 {
None
} else {
Some(i + 1)
i += 1;
if !buffer.is_empty() {
buffer.extend(&data[last_buf_index..i]);
}
Some((buffer, i))
}
}
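The buffer built by `scan_html_block_inner` above is what ends up in `ItemBody::OwnedHtml`: when inline HTML spans multiple blockquote lines, the skipped `> ` prefixes are dropped and the remaining bytes are stitched into an owned string, emitted as an `Event::Html`. A sketch of the observable effect, assuming the crate's public event API:

use pulldown_cmark::{Event, Parser};

fn main() {
    // Inline HTML split across blockquote lines (cf. regression test 66 below).
    let input = "> > a <a href\n> > =\"yo\n> > lo\">\n";
    for event in Parser::new(input) {
        if let Event::Html(html) = event {
            // The `> > ` prefixes are gone; only the tag text remains.
            println!("{:?}", html);
        }
    }
}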

@ -237,7 +237,7 @@ fn html_test_broken_callback() {
let mut s = String::new();
let callback = |broken_link: BrokenLink| {
let mut callback = |broken_link: BrokenLink| {
if broken_link.reference == "foo" || broken_link.reference == "baz" {
Some(("https://replaced.example.org".into(), "some title".into()))
} else {

@ -461,6 +461,21 @@ fn regression_test_33() {
#[test]
fn regression_test_34() {
let original = r##"> [foo
> bar]: /url
>
> [foo bar]
"##;
let expected = r##"<blockquote>
<p><a href="/url">foo bar</a></p>
</blockquote>
"##;
test_markdown_html(original, expected, false);
}
#[test]
fn regression_test_35() {
let original = r##"> foo | bar
> --- | ---
yolo | swag
@ -475,7 +490,7 @@ yolo | swag
}
#[test]
fn regression_test_35() {
fn regression_test_36() {
let original = r##"<foo bar>
"##;
let expected = r##"<foo bar>
@ -485,7 +500,7 @@ fn regression_test_35() {
}
#[test]
fn regression_test_36() {
fn regression_test_37() {
let original = r##"<foo bar =
"hi">
"##;
@ -497,7 +512,7 @@ fn regression_test_36() {
}
#[test]
fn regression_test_37() {
fn regression_test_38() {
let original = r##"~~*_**__
__a__
@ -510,7 +525,7 @@ __a__
}
#[test]
fn regression_test_38() {
fn regression_test_39() {
let original = r##"> `
> `
"##;
@ -523,7 +538,7 @@ fn regression_test_38() {
}
#[test]
fn regression_test_39() {
fn regression_test_40() {
let original = r##"`\|`
"##;
let expected = r##"<p><code>\|</code></p>
@ -533,7 +548,7 @@ fn regression_test_39() {
}
#[test]
fn regression_test_40() {
fn regression_test_41() {
let original = r##"Paragraph 1
Paragraph 2
@ -546,7 +561,7 @@ Paragraph 2
}
#[test]
fn regression_test_41() {
fn regression_test_42() {
let original = r##"\[[link text](https://www.google.com/)\]
"##;
let expected = r##"<p>[<a href="https://www.google.com/">link text</a>]</p>
@ -556,7 +571,7 @@ fn regression_test_41() {
}
#[test]
fn regression_test_42() {
fn regression_test_43() {
let original = r##"foo | bar
--- | ---
[a](< | url>)
@ -568,7 +583,7 @@ fn regression_test_42() {
}
#[test]
fn regression_test_43() {
fn regression_test_44() {
let original = r##"[a](url "
- - -
")
@ -582,7 +597,7 @@ fn regression_test_43() {
}
#[test]
fn regression_test_44() {
fn regression_test_45() {
let original = r##"[a](url
)
@ -595,7 +610,7 @@ fn regression_test_44() {
}
#[test]
fn regression_test_45() {
fn regression_test_46() {
let original = r##"[a](b "
")
@ -608,7 +623,7 @@ fn regression_test_45() {
}
#[test]
fn regression_test_46() {
fn regression_test_47() {
let original = r##"<http:// >
"##;
let expected = r##"<p>&lt;http:// &gt;</p>
@ -618,7 +633,7 @@ fn regression_test_46() {
}
#[test]
fn regression_test_47() {
fn regression_test_48() {
let original = r##"<http://>
"##;
let expected = r##"<p>&lt;http://&gt;</p>
@ -628,7 +643,7 @@ fn regression_test_47() {
}
#[test]
fn regression_test_48() {
fn regression_test_49() {
let original = r##"foo | bar
--- | ---
<http://| baz
@ -647,7 +662,7 @@ fn regression_test_48() {
}
#[test]
fn regression_test_49() {
fn regression_test_50() {
let original = r##"foo | bar
--- | ---
<http://|>
@ -666,7 +681,7 @@ fn regression_test_49() {
}
#[test]
fn regression_test_50() {
fn regression_test_51() {
let original = r##"<sup>\*hi</sup>\_
"##;
let expected = r##"<p><sup>*hi</sup>_</p>
@ -676,7 +691,7 @@ fn regression_test_50() {
}
#[test]
fn regression_test_51() {
fn regression_test_52() {
let original = r##"email: <john@example.com>\_
"##;
let expected = r##"<p>email: <a href="mailto:john@example.com">john@example.com</a>_</p>
@ -686,7 +701,7 @@ fn regression_test_51() {
}
#[test]
fn regression_test_52() {
fn regression_test_53() {
let original = r##"> [link](/url 'foo
> bar')
"##;
@ -700,7 +715,7 @@ bar">link</a></p>
}
#[test]
fn regression_test_53() {
fn regression_test_54() {
let original = r##"> [foo
> bar]: /url
>
@ -715,7 +730,7 @@ fn regression_test_53() {
}
#[test]
fn regression_test_54() {
fn regression_test_55() {
let original = r##"> [foo bar]: /url
>
> [foo
@ -731,7 +746,7 @@ bar</a></p>
}
#[test]
fn regression_test_55() {
fn regression_test_56() {
let original = r##"> - [a
> b c]: /foo
@ -749,7 +764,7 @@ fn regression_test_55() {
}
#[test]
fn regression_test_56() {
fn regression_test_57() {
let original = r##"[a
> b]: /foo
@ -766,7 +781,7 @@ fn regression_test_56() {
}
#[test]
fn regression_test_57() {
fn regression_test_58() {
let original = r##"[`cargo
package`]
@ -779,7 +794,7 @@ package`]
}
#[test]
fn regression_test_58() {
fn regression_test_59() {
let original = r##"> [`cargo
> package`]
@ -794,7 +809,7 @@ fn regression_test_58() {
}
#[test]
fn regression_test_59() {
fn regression_test_60() {
let original = r##"> `cargo
> package`
"##;
@ -806,7 +821,23 @@ fn regression_test_59() {
}
#[test]
fn regression_test_60() {
fn regression_test_61() {
let original = r##"> Note: Though you should not rely on this, all pointers to <abbr
> title="Dynamically Sized Types">DSTs</abbr> are currently twice the size of
> the size of `usize` and have the same alignment.
"##;
let expected = r##"<blockquote>
<p>Note: Though you should not rely on this, all pointers to
<abbr title="Dynamically Sized Types">DSTs</abbr> are currently twice the size of
the size of <code>usize</code> and have the same alignment.</p>
</blockquote>
"##;
test_markdown_html(original, expected, false);
}
#[test]
fn regression_test_62() {
let original = r##"Lorem ipsum.[^a]
An unordered list before the footnotes:
@ -830,7 +861,7 @@ An unordered list before the footnotes:
}
#[test]
fn regression_test_61() {
fn regression_test_63() {
let original = r##"[][a]
[a]: b
@ -849,7 +880,7 @@ fn regression_test_61() {
}
#[test]
fn regression_test_62() {
fn regression_test_64() {
let original = r##"* A list.
* A sublist.
@ -894,7 +925,7 @@ fn regression_test_62() {
}
#[test]
fn regression_test_63() {
fn regression_test_65() {
let original = r##"<foo
"##;
let expected = r##"<p>&lt;foo</p>
@ -902,3 +933,21 @@ fn regression_test_63() {
test_markdown_html(original, expected, false);
}
#[test]
fn regression_test_66() {
let original = r##"> > a <a href
> > ="yo
> > lo">
"##;
let expected = r##"<blockquote>
<blockquote>
<p>a <a href
="yo
lo"></p>
</blockquote>
</blockquote>
"##;
test_markdown_html(original, expected, false);
}
