Merge branch 'master' into dynamic-LUT

bidi-plume
Marcus Klaas de Vries 4 years ago
commit 7277fb5171

@ -0,0 +1,37 @@
extern crate pulldown_cmark;
use pulldown_cmark::{html, BrokenLink, Options, Parser};
fn main() {
let input: &str = "Hello world, check out [my website][].";
println!("Parsing the following markdown string:\n{}", input);
// Set up a callback that sets the URL and title when it encounters
// a reference to our home page.
let callback = &mut |broken_link: BrokenLink| {
if broken_link.reference == "my website" {
println!(
"Replacing the markdown `{}` of type {:?} with a working link",
&input[broken_link.span], broken_link.link_type,
);
Some(("http://example.com".into(), "my example website".into()))
} else {
None
}
};
// Create a parser with our callback function for broken links.
let parser = Parser::new_with_broken_link_callback(input, Options::empty(), Some(callback));
// Write to String buffer.
let mut html_output: String = String::with_capacity(input.len() * 3 / 2);
html::push_html(&mut html_output, parser);
// Check that the output is what we expected.
let expected_html: &str =
"<p>Hello world, check out <a href=\"http://example.com\" title=\"my example website\">my website</a>.</p>\n";
assert_eq!(expected_html, &html_output);
// Write result to stdout.
println!("\nHTML output:\n{}", &html_output);
}
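Note on the API change shown later in this commit: the callback type moves from `Option<&'a dyn Fn(&str, &str) -> Option<(String, String)>>` to the new `BrokenLinkCallback` alias, `Option<&'a mut dyn FnMut(BrokenLink) -> Option<(CowStr<'a>, CowStr<'a>)>>`. Below is a minimal migration sketch under that assumption; the `resolve` helper is hypothetical and only stands in for an existing old-style resolver.

use pulldown_cmark::{html, BrokenLink, CowStr, Options, Parser};

// Hypothetical old-style resolver taking (normalized, raw) reference strings.
fn resolve(_normalized: &str, raw: &str) -> Option<(String, String)> {
    Some((format!("https://example.com/{}", raw), raw.to_owned()))
}

fn render(input: &str) -> String {
    // Adapt the resolver to the new FnMut(BrokenLink) callback shape.
    let mut callback = |link: BrokenLink| {
        resolve(link.reference, link.reference)
            .map(|(url, title)| (CowStr::from(url), CowStr::from(title)))
    };
    let parser =
        Parser::new_with_broken_link_callback(input, Options::empty(), Some(&mut callback));
    let mut out = String::new();
    html::push_html(&mut out, parser);
    out
}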

@ -388,9 +388,9 @@ ISSUE #295
<p>[foo]:</p>
````````````````````````````````
ISSUE #298 (not yet fixed)
ISSUE #298
```````````````````````````````` DISABLED example
```````````````````````````````` example
> [foo
> bar]: /url
>
@ -697,7 +697,7 @@ ISSUE 398
ISSUE 399
```````````````````````````````` DISABLED example
```````````````````````````````` example
> Note: Though you should not rely on this, all pointers to <abbr
> title="Dynamically Sized Types">DSTs</abbr> are currently twice the size of
> the size of `usize` and have the same alignment.
@ -798,3 +798,19 @@ ISSUE 437
.
<p>&lt;foo</p>
````````````````````````````````
Inline HTML stress test
```````````````````````````````` example
> > a <a href
> > ="yo
> > lo">
.
<blockquote>
<blockquote>
<p>a <a href
="yo
lo"></p>
</blockquote>
</blockquote>
````````````````````````````````

@ -18,13 +18,13 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//! Utility functions for HTML escaping
//! Utility functions for HTML escaping. Only useful when building your own
//! HTML renderer.
use std::io;
use std::fmt::{Arguments, Write as FmtWrite};
use std::io::{self, ErrorKind, Write};
use std::str::from_utf8;
use crate::html::StrWrite;
#[rustfmt::skip]
static HREF_SAFE: [u8; 128] = [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -41,7 +41,66 @@ static HEX_CHARS: &[u8] = b"0123456789ABCDEF";
static AMP_ESCAPE: &str = "&amp;";
static SLASH_ESCAPE: &str = "&#x27;";
pub(crate) fn escape_href<W>(mut w: W, s: &str) -> io::Result<()>
/// This wrapper exists because we can't have both a blanket implementation
/// for all types implementing `Write` and types of the form `&mut W` where
/// `W: StrWrite`. Since we need the latter a lot, we choose to wrap
/// `Write` types.
pub struct WriteWrapper<W>(pub W);
/// Trait that allows writing string slices. This is basically an extension
/// of `std::io::Write` in order to include `String`.
pub trait StrWrite {
fn write_str(&mut self, s: &str) -> io::Result<()>;
fn write_fmt(&mut self, args: Arguments) -> io::Result<()>;
}
impl<W> StrWrite for WriteWrapper<W>
where
W: Write,
{
#[inline]
fn write_str(&mut self, s: &str) -> io::Result<()> {
self.0.write_all(s.as_bytes())
}
#[inline]
fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
self.0.write_fmt(args)
}
}
impl<'w> StrWrite for String {
#[inline]
fn write_str(&mut self, s: &str) -> io::Result<()> {
self.push_str(s);
Ok(())
}
#[inline]
fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
// FIXME: translate fmt error to io error?
FmtWrite::write_fmt(self, args).map_err(|_| ErrorKind::Other.into())
}
}
impl<W> StrWrite for &'_ mut W
where
W: StrWrite,
{
#[inline]
fn write_str(&mut self, s: &str) -> io::Result<()> {
(**self).write_str(s)
}
#[inline]
fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
(**self).write_fmt(args)
}
}
/// Writes an href to the buffer, escaping href unsafe bytes.
pub fn escape_href<W>(mut w: W, s: &str) -> io::Result<()>
where
W: StrWrite,
{
@ -93,7 +152,7 @@ static HTML_ESCAPES: [&'static str; 5] = ["", "&quot;", "&amp;", "&lt;", "&gt;"]
/// Writes the given string to the Write sink, replacing special HTML bytes
/// (<, >, &, ") by escape sequences.
pub(crate) fn escape_html<W: StrWrite>(w: W, s: &str) -> io::Result<()> {
pub fn escape_html<W: StrWrite>(w: W, s: &str) -> io::Result<()> {
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
{
simd::escape_html(w, s)
@ -131,7 +190,7 @@ fn escape_html_scalar<W: StrWrite>(mut w: W, s: &str) -> io::Result<()> {
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
mod simd {
use crate::html::StrWrite;
use super::StrWrite;
use std::arch::x86_64::*;
use std::io;
use std::mem::size_of;
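Since `escape` becomes a public module (see the lib.rs hunk later in this commit) and `String` implements the new `StrWrite` trait, custom renderers can escape straight into a string buffer. A minimal sketch, assuming the module is reachable as `pulldown_cmark::escape`:

use pulldown_cmark::escape::{escape_href, escape_html};

fn main() -> std::io::Result<()> {
    // `String` implements `StrWrite`, so it can serve as the sink directly.
    let mut out = String::new();
    out.push_str("<a href=\"");
    escape_href(&mut out, "/search?q=a&b")?;
    out.push_str("\">");
    escape_html(&mut out, "a < b & \"c\"")?;
    out.push_str("</a>");
    println!("{}", out);
    Ok(())
}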

@ -21,10 +21,9 @@
//! HTML renderer that takes an iterator of events as input.
use std::collections::HashMap;
use std::fmt::{Arguments, Write as FmtWrite};
use std::io::{self, ErrorKind, Write};
use std::io::{self, Write};
use crate::escape::{escape_href, escape_html};
use crate::escape::{escape_href, escape_html, StrWrite, WriteWrapper};
use crate::parse::Event::*;
use crate::parse::{Alignment, CodeBlockKind, Event, LinkType, Tag};
use crate::strings::CowStr;
@ -34,64 +33,6 @@ enum TableState {
Body,
}
/// This wrapper exists because we can't have both a blanket implementation
/// for all types implementing `Write` and types of the form `&mut W` where
/// `W: StrWrite`. Since we need the latter a lot, we choose to wrap
/// `Write` types.
struct WriteWrapper<W>(W);
/// Trait that allows writing string slices. This is basically an extension
/// of `std::io::Write` in order to include `String`.
pub(crate) trait StrWrite {
fn write_str(&mut self, s: &str) -> io::Result<()>;
fn write_fmt(&mut self, args: Arguments) -> io::Result<()>;
}
impl<W> StrWrite for WriteWrapper<W>
where
W: Write,
{
#[inline]
fn write_str(&mut self, s: &str) -> io::Result<()> {
self.0.write_all(s.as_bytes())
}
#[inline]
fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
self.0.write_fmt(args)
}
}
impl<'w> StrWrite for String {
#[inline]
fn write_str(&mut self, s: &str) -> io::Result<()> {
self.push_str(s);
Ok(())
}
#[inline]
fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
// FIXME: translate fmt error to io error?
FmtWrite::write_fmt(self, args).map_err(|_| ErrorKind::Other.into())
}
}
impl<W> StrWrite for &'_ mut W
where
W: StrWrite,
{
#[inline]
fn write_str(&mut self, s: &str) -> io::Result<()> {
(**self).write_str(s)
}
#[inline]
fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
(**self).write_fmt(args)
}
}
struct HtmlWriter<'a, I, W> {
/// Iterator supplying events.
iter: I,

@ -59,7 +59,7 @@ extern crate bitflags;
extern crate unicase;
mod entities;
mod escape;
pub mod escape;
mod linklabel;
mod parse;
mod puncttable;
@ -71,6 +71,6 @@ mod tree;
mod simd;
pub use crate::parse::{
Alignment, CodeBlockKind, Event, LinkType, OffsetIter, Options, Parser, Tag,
Alignment, BrokenLink, CodeBlockKind, Event, LinkType, OffsetIter, Options, Parser, Tag,
};
pub use crate::strings::{CowStr, InlineStr};

@ -215,7 +215,8 @@ enum ItemBody {
MaybeCode(usize, bool), // number of backticks, preceded by backslash
MaybeHtml,
MaybeLinkOpen,
MaybeLinkClose,
// bool indicates whether or not the preceding section could be a reference
MaybeLinkClose(bool),
MaybeImage,
// These are inline items after resolution.
@ -233,6 +234,7 @@ enum ItemBody {
FencedCodeBlock(CowIndex),
IndentCodeBlock,
Html,
OwnedHtml(CowIndex),
BlockQuote,
List(bool, u8, u64), // is_tight, list character, list start index
ListItem(usize), // indent level
@ -258,7 +260,7 @@ impl<'a> ItemBody {
| ItemBody::MaybeHtml
| ItemBody::MaybeCode(..)
| ItemBody::MaybeLinkOpen
| ItemBody::MaybeLinkClose
| ItemBody::MaybeLinkClose(..)
| ItemBody::MaybeImage => true,
_ => false,
}
@ -282,6 +284,12 @@ enum TableParseMode {
Disabled,
}
pub struct BrokenLink<'a> {
pub span: std::ops::Range<usize>,
pub link_type: LinkType,
pub reference: &'a str,
}
/// State for the first parsing pass.
///
/// The first pass resolves all block structure, generating an AST. Within a block, items
@ -443,7 +451,7 @@ impl<'a, 'b> FirstPass<'a, 'b> {
}
// Detect type 7
if let Some(_html_bytes) = scan_html_type_7(&bytes[(ix + 1)..]) {
if let Some(_html_bytes) = scan_html_type_7(&bytes[ix..]) {
return self.parse_html_block_type_6_or_7(ix, remaining_space);
}
}
@ -851,7 +859,7 @@ impl<'a, 'b> FirstPass<'a, 'b> {
self.tree.append(Item {
start: ix,
end: ix + 1,
body: ItemBody::MaybeLinkClose,
body: ItemBody::MaybeLinkClose(true),
});
begin_text = ix + 1;
LoopInstruction::ContinueAndSkip(0)
@ -1743,8 +1751,8 @@ impl InlineStack {
#[derive(Debug, Clone)]
enum RefScan<'a> {
// label, next node index, source ix of label end
LinkLabel(CowStr<'a>, Option<TreeIndex>, usize),
// label, source ix of label end
LinkLabel(CowStr<'a>, usize),
// contains next node index
Collapsed(Option<TreeIndex>),
Failed,
@ -1772,6 +1780,7 @@ fn scan_nodes_to_ix(
fn scan_link_label<'text, 'tree>(
tree: &'tree Tree<Item>,
text: &'text str,
allow_footnote_refs: bool,
) -> Option<(usize, ReferenceLabel<'text>)> {
let bytes = &text.as_bytes();
if bytes.len() < 2 || bytes[0] != b'[' {
@ -1782,7 +1791,7 @@ fn scan_link_label<'text, 'tree>(
let _ = scan_containers(tree, &mut line_start);
Some(line_start.bytes_scanned())
};
let pair = if b'^' == bytes[1] {
let pair = if allow_footnote_refs && b'^' == bytes[1] {
let (byte_index, cow) = scan_link_label_rest(&text[2..], &linebreak_handler)?;
(byte_index + 2, ReferenceLabel::Footnote(cow))
} else {
@ -1796,6 +1805,7 @@ fn scan_reference<'a, 'b>(
tree: &'a Tree<Item>,
text: &'b str,
cur: Option<TreeIndex>,
allow_footnote_refs: bool,
) -> RefScan<'b> {
let cur_ix = match cur {
None => return RefScan::Failed,
@ -1807,9 +1817,10 @@ fn scan_reference<'a, 'b>(
if tail.starts_with(b"[]") {
let closing_node = tree[cur_ix].next.unwrap();
RefScan::Collapsed(tree[closing_node].next)
} else if let Some((ix, ReferenceLabel::Link(label))) = scan_link_label(tree, &text[start..]) {
let next_node = scan_nodes_to_ix(tree, cur, start + ix);
RefScan::LinkLabel(label, next_node, start + ix)
} else if let Some((ix, ReferenceLabel::Link(label))) =
scan_link_label(tree, &text[start..], allow_footnote_refs)
{
RefScan::LinkLabel(label, start + ix)
} else {
RefScan::Failed
}
@ -2033,13 +2044,16 @@ pub(crate) fn create_lut(options: &Options) -> LookupTable {
}
}
pub type BrokenLinkCallback<'a> =
Option<&'a mut dyn FnMut(BrokenLink) -> Option<(CowStr<'a>, CowStr<'a>)>>;
/// Markdown event iterator.
#[derive(Clone)]
pub struct Parser<'a> {
text: &'a str,
options: Options,
tree: Tree<Item>,
allocs: Allocations<'a>,
broken_link_callback: Option<&'a dyn Fn(&str, &str) -> Option<(String, String)>>,
broken_link_callback: BrokenLinkCallback<'a>,
html_scan_guard: HtmlScanGuard,
// used by inline passes. store them here for reuse
@ -2066,7 +2080,7 @@ impl<'a> Parser<'a> {
pub fn new_with_broken_link_callback(
text: &'a str,
options: Options,
broken_link_callback: Option<&'a dyn Fn(&str, &str) -> Option<(String, String)>>,
broken_link_callback: BrokenLinkCallback<'a>,
) -> Parser<'a> {
let lut = create_lut(&options);
let first_pass = FirstPass::new(text, options, &lut);
@ -2077,6 +2091,7 @@ impl<'a> Parser<'a> {
let html_scan_guard = Default::default();
Parser {
text,
options,
tree,
allocs,
broken_link_callback,
@ -2139,17 +2154,23 @@ impl<'a> Parser<'a> {
}
continue;
} else {
let inline_html = if let Some(next_ix) = next {
let inline_html = next.and_then(|next_ix| {
self.scan_inline_html(
block_text.as_bytes(),
self.tree[next_ix].item.start,
)
} else {
None
};
if let Some(ix) = inline_html {
});
if let Some((span, ix)) = inline_html {
let node = scan_nodes_to_ix(&self.tree, next, ix);
self.tree[cur_ix].item.body = ItemBody::Html;
self.tree[cur_ix].item.body = if !span.is_empty() {
let converted_string =
String::from_utf8(span).expect("invalid utf8");
ItemBody::OwnedHtml(
self.allocs.allocate_cow(converted_string.into()),
)
} else {
ItemBody::Html
};
self.tree[cur_ix].item.end = ix;
self.tree[cur_ix].next = node;
prev = cur;
@ -2223,7 +2244,7 @@ impl<'a> Parser<'a> {
ty: LinkStackTy::Image,
});
}
ItemBody::MaybeLinkClose => {
ItemBody::MaybeLinkClose(could_be_ref) => {
self.tree[cur_ix].item.body = ItemBody::Text;
if let Some(tos) = self.link_stack.pop() {
if tos.ty == LinkStackTy::Disabled {
@ -2259,23 +2280,53 @@ impl<'a> Parser<'a> {
} else {
// ok, so it's not an inline link. maybe it is a reference
// to a defined link?
let scan_result = scan_reference(&self.tree, block_text, next);
let scan_result = scan_reference(
&self.tree,
block_text,
next,
self.options.contains(Options::ENABLE_FOOTNOTES),
);
let (node_after_link, link_type) = match scan_result {
// [label][reference]
RefScan::LinkLabel(_, next_node, _) => {
RefScan::LinkLabel(_, end_ix) => {
// Toggle reference viability of the last closing bracket,
// so that we can skip it on future iterations in case
// it fails in this one. In particular, we won't call
// the broken link callback twice on one reference.
let reference_close_node =
scan_nodes_to_ix(&self.tree, next, end_ix - 1).unwrap();
self.tree[reference_close_node].item.body =
ItemBody::MaybeLinkClose(false);
let next_node = self.tree[reference_close_node].next;
(next_node, LinkType::Reference)
}
// []
RefScan::Collapsed(next_node) => (next_node, LinkType::Collapsed),
// [reference][]
RefScan::Collapsed(next_node) => {
// This reference has already been tried, and it's not
// valid. Skip it.
if !could_be_ref {
continue;
}
(next_node, LinkType::Collapsed)
}
// [shortcut]
//
// [shortcut]: /blah
RefScan::Failed => (next, LinkType::Shortcut),
RefScan::Failed => {
if !could_be_ref {
continue;
}
(next, LinkType::Shortcut)
}
};
// FIXME: references and labels are mixed in the naming of variables
// below. Disambiguate!
// (label, source_ix end)
let label: Option<(ReferenceLabel<'a>, usize)> = match scan_result {
RefScan::LinkLabel(l, _, end_ix) => {
RefScan::LinkLabel(l, end_ix) => {
Some((ReferenceLabel::Link(l), end_ix))
}
RefScan::Collapsed(..) | RefScan::Failed => {
@ -2284,6 +2335,7 @@ impl<'a> Parser<'a> {
scan_link_label(
&self.tree,
&self.text[label_start..self.tree[cur_ix].item.end],
self.options.contains(Options::ENABLE_FOOTNOTES),
)
.map(|(ix, label)| (label, label_start + ix))
}
@ -2316,15 +2368,21 @@ impl<'a> Parser<'a> {
(link_type, url, title)
})
.or_else(|| {
self.broken_link_callback
.and_then(|callback| {
// looked for matching definition, but didn't find it. try to fix
// link with callback, if it is defined
callback(link_label.as_ref(), link_label.as_ref())
})
.map(|(url, title)| {
(link_type.to_unknown(), url.into(), title.into())
})
match self.broken_link_callback.as_mut() {
Some(callback) => {
// Construct a BrokenLink struct, which will be passed to the callback
let broken_link = BrokenLink {
span: (self.tree[tos.node].item.start)..end,
link_type: link_type,
reference: link_label.as_ref(),
};
callback(broken_link).map(|(url, title)| {
(link_type.to_unknown(), url, title)
})
}
None => None,
}
});
if let Some((def_link_type, url, title)) = type_url_title {
@ -2685,23 +2743,32 @@ impl<'a> Parser<'a> {
}
}
/// Returns the next byte offset on success.
fn scan_inline_html(&mut self, bytes: &[u8], ix: usize) -> Option<usize> {
/// On success, returns a buffer containing the inline html and byte offset.
/// When no bytes were skipped, the buffer will be empty and the html can be
/// represented as a subslice of the input string.
fn scan_inline_html(&mut self, bytes: &[u8], ix: usize) -> Option<(Vec<u8>, usize)> {
let c = *bytes.get(ix)?;
if c == b'!' {
scan_inline_html_comment(bytes, ix + 1, &mut self.html_scan_guard)
Some((
vec![],
scan_inline_html_comment(bytes, ix + 1, &mut self.html_scan_guard)?,
))
} else if c == b'?' {
scan_inline_html_processing(bytes, ix + 1, &mut self.html_scan_guard)
Some((
vec![],
scan_inline_html_processing(bytes, ix + 1, &mut self.html_scan_guard)?,
))
} else {
let i = scan_html_block_inner(
&bytes[ix..],
let (span, i) = scan_html_block_inner(
// Subtract 1 to include the < character
&bytes[(ix - 1)..],
Some(&|_bytes| {
let mut line_start = LineStart::new(bytes);
let _ = scan_containers(&self.tree, &mut line_start);
line_start.bytes_scanned()
}),
)?;
Some(i + ix)
Some((span, i + ix - 1))
}
}
@ -2873,6 +2940,7 @@ fn item_to_event<'a>(item: Item, text: &'a str, allocs: &Allocations<'a>) -> Eve
ItemBody::SynthesizeText(cow_ix) => return Event::Text(allocs[cow_ix].clone()),
ItemBody::SynthesizeChar(c) => return Event::Text(c.into()),
ItemBody::Html => return Event::Html(text[item.start..item.end].into()),
ItemBody::OwnedHtml(cow_ix) => return Event::Html(allocs[cow_ix].clone()),
ItemBody::SoftBreak => return Event::SoftBreak,
ItemBody::HardBreak => return Event::HardBreak,
ItemBody::FootnoteReference(cow_ix) => {
@ -3132,7 +3200,7 @@ mod test {
#[test]
fn footnote_offsets() {
let range = Parser::new("Testing this[^1] out.\n\n[^1]: Footnote.")
let range = parser_with_extensions("Testing this[^1] out.\n\n[^1]: Footnote.")
.into_offset_iter()
.filter_map(|(ev, range)| match ev {
Event::FootnoteReference(..) => Some(range),
@ -3186,6 +3254,16 @@ mod test {
assert_eq!(expected, buf);
}
#[test]
fn no_footnote_refs_without_option() {
let test_str = "a [^a]\n\n[^a]: yolo";
let expected = "<p>a <a href=\"yolo\">^a</a></p>\n";
let mut buf = String::new();
crate::html::push_html(&mut buf, Parser::new(test_str));
assert_eq!(expected, buf);
}
#[test]
fn ref_def_at_eof() {
let test_str = "[test]:\\";
@ -3216,18 +3294,37 @@ mod test {
assert_eq!(expected, buf);
}
#[test]
fn broken_links_called_only_once() {
for &(markdown, expected) in &[
("See also [`g()`][crate::g].", 1),
("See also [`g()`][crate::g][].", 1),
("[brokenlink1] some other node [brokenlink2]", 2),
] {
let mut times_called = 0;
let callback = &mut |_broken_link: BrokenLink| {
times_called += 1;
None
};
let parser =
Parser::new_with_broken_link_callback(markdown, Options::empty(), Some(callback));
for _ in parser {}
assert_eq!(times_called, expected);
}
}
#[test]
fn simple_broken_link_callback() {
let test_str = "This is a link w/o def: [hello][world]";
let parser = Parser::new_with_broken_link_callback(
test_str,
Options::empty(),
Some(&|norm, raw| {
assert_eq!("world", raw);
assert_eq!("world", norm);
Some(("YOLO".to_owned(), "SWAG".to_owned()))
}),
);
let mut callback = |broken_link: BrokenLink| {
assert_eq!("world", broken_link.reference);
assert_eq!(&test_str[broken_link.span], "[hello][world]");
let url = "YOLO".into();
let title = "SWAG".to_owned().into();
Some((url, title))
};
let parser =
Parser::new_with_broken_link_callback(test_str, Options::empty(), Some(&mut callback));
let mut link_tag_count = 0;
for (typ, url, title) in parser.filter_map(|event| match event {
Event::Start(tag) | Event::End(tag) => match tag {

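The `allow_footnote_refs` flag threaded through `scan_link_label` and `scan_reference` above means `[^...]` labels are only treated as footnotes when `Options::ENABLE_FOOTNOTES` is set (see the `no_footnote_refs_without_option` test). A small sketch of the observable difference, assuming the crate's `Parser::new_ext` constructor for the options-enabled case:

use pulldown_cmark::{html, Options, Parser};

fn main() {
    let input = "a [^a]\n\n[^a]: yolo";

    // Without ENABLE_FOOTNOTES, `[^a]` is treated as an ordinary link reference.
    let mut plain = String::new();
    html::push_html(&mut plain, Parser::new(input));

    // With ENABLE_FOOTNOTES, it is parsed as a footnote reference instead.
    let mut with_footnotes = String::new();
    html::push_html(
        &mut with_footnotes,
        Parser::new_ext(input, Options::ENABLE_FOOTNOTES),
    );

    assert_ne!(plain, with_footnotes);
    println!("{}\n{}", plain, with_footnotes);
}
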
@ -855,19 +855,24 @@ fn scan_attribute_name(data: &[u8]) -> Option<usize> {
}
}
/// Returns byte scanned (TODO: should it return new offset?)
// TODO: properly use the newline handler here
fn scan_attribute(data: &[u8], newline_handler: Option<&dyn Fn(&[u8]) -> usize>) -> Option<usize> {
let allow_newline = newline_handler.is_some();
let whitespace_scanner =
|c| is_ascii_whitespace(c) && (allow_newline || c != b'\n' && c != b'\r');
let mut ix = scan_attribute_name(data)?;
let n_whitespace = scan_while(&data[ix..], whitespace_scanner);
/// Returns the index immediately following the attribute on success.
/// The argument `buffer_ix` refers to the index into `data` from which we
/// should copy into `buffer` when we find bytes to skip.
fn scan_attribute(
data: &[u8],
mut ix: usize,
newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
buffer: &mut Vec<u8>,
buffer_ix: &mut usize,
) -> Option<usize> {
ix += scan_attribute_name(&data[ix..])?;
let n_whitespace =
scan_whitespace_with_newline_handler(data, ix, newline_handler, buffer, buffer_ix)? - ix;
ix += n_whitespace;
if scan_ch(&data[ix..], b'=') == 1 {
ix += 1;
ix += scan_while(&data[ix..], whitespace_scanner);
ix += scan_attribute_value(&data[ix..], newline_handler)?;
ix = scan_whitespace_with_newline_handler(data, ix, newline_handler, buffer, buffer_ix)?;
ix = scan_attribute_value(&data, ix, newline_handler, buffer, buffer_ix)?;
} else if n_whitespace > 0 {
// Leave whitespace for next attribute.
ix -= 1;
@ -875,12 +880,48 @@ fn scan_attribute(data: &[u8], newline_handler: Option<&dyn Fn(&[u8]) -> usize>)
Some(ix)
}
/// Scans whitespace and possibly newlines according to the
/// behavior defined by the newline handler. When bytes are skipped,
/// all preceding non-skipped bytes are pushed to the buffer.
fn scan_whitespace_with_newline_handler(
data: &[u8],
mut i: usize,
newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
buffer: &mut Vec<u8>,
buffer_ix: &mut usize,
) -> Option<usize> {
while i < data.len() {
if !is_ascii_whitespace(data[i]) {
return Some(i);
}
if let Some(eol_bytes) = scan_eol(&data[i..]) {
let handler = newline_handler?;
i += eol_bytes;
let skipped_bytes = handler(&data[i..]);
if skipped_bytes > 0 {
buffer.extend(&data[*buffer_ix..i]);
*buffer_ix = i + skipped_bytes;
}
i += skipped_bytes;
} else {
i += 1;
}
}
Some(i)
}
/// Returns the index immediately following the attribute value on success.
fn scan_attribute_value(
data: &[u8],
mut i: usize,
newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
buffer: &mut Vec<u8>,
buffer_ix: &mut usize,
) -> Option<usize> {
let mut i = 0;
match *data.get(0)? {
match *data.get(i)? {
b @ b'"' | b @ b'\'' => {
i += 1;
while i < data.len() {
@ -890,7 +931,13 @@ fn scan_attribute_value(
if let Some(eol_bytes) = scan_eol(&data[i..]) {
let handler = newline_handler?;
i += eol_bytes;
i += handler(&data[i..]);
let skipped_bytes = handler(&data[i..]);
if skipped_bytes > 0 {
buffer.extend(&data[*buffer_ix..i]);
*buffer_ix = i + skipped_bytes;
}
i += skipped_bytes;
} else {
i += 1;
}
@ -905,6 +952,7 @@ fn scan_attribute_value(
i += scan_attr_value_chars(&data[i..]);
}
}
Some(i)
}
@ -975,30 +1023,36 @@ pub(crate) fn is_html_tag(tag: &[u8]) -> bool {
.is_ok()
}
/// Assumes that `data` is preceded by `<`.
/// Assumes that `data` starts with `<`.
/// Returns the index into data directly after the html tag on success.
pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
// Block type html does not allow for newlines, so we
// do not pass a newline handler.
let i = scan_html_block_inner(data, None)?;
let (_span, i) = scan_html_block_inner(data, None)?;
scan_blank_line(&data[i..])?;
Some(i)
}
// FIXME: maybe this should receive a whitespace handler
// instead of a newline handler.
// With signature `&dyn Fn(&[u8]) -> Option<usize>`.
// We currently need to implement whitespace handling in all of
// this function's dependencies as well.
/// Assumes that `data` starts with `<`.
/// Returns the number of bytes scanned and the html in case of
/// success.
/// When some bytes were skipped because the html was split over
/// multiple leaves (e.g. over multiple lines in a blockquote),
/// the html is returned as a vector of bytes.
/// If no bytes were skipped, the buffer will be empty.
pub(crate) fn scan_html_block_inner(
data: &[u8],
newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
) -> Option<usize> {
let close_tag_bytes = scan_ch(data, b'/');
let l = scan_while(&data[close_tag_bytes..], is_ascii_alpha);
) -> Option<(Vec<u8>, usize)> {
let mut buffer = Vec::new();
let mut last_buf_index = 0;
let close_tag_bytes = scan_ch(&data[1..], b'/');
let l = scan_while(&data[(1 + close_tag_bytes)..], is_ascii_alpha);
if l == 0 {
return None;
}
let mut i = close_tag_bytes + l;
let mut i = 1 + close_tag_bytes + l;
i += scan_while(&data[i..], is_ascii_letterdigitdash);
if close_tag_bytes == 0 {
@ -1010,11 +1064,14 @@ pub(crate) fn scan_html_block_inner(
if eol_bytes == 0 {
return None;
}
if let Some(handler) = newline_handler {
i += eol_bytes;
i += handler(&data[i..]);
} else {
return None;
let handler = newline_handler?;
i += eol_bytes;
let skipped_bytes = handler(&data[i..]);
if skipped_bytes > 0 {
buffer.extend(&data[last_buf_index..i]);
i += skipped_bytes;
last_buf_index = i;
}
} else {
break;
@ -1027,7 +1084,7 @@ pub(crate) fn scan_html_block_inner(
// No whitespace, which is mandatory.
return None;
}
i += scan_attribute(&data[i..], newline_handler)?;
i = scan_attribute(&data, i, newline_handler, &mut buffer, &mut last_buf_index)?;
}
}
@ -1040,7 +1097,11 @@ pub(crate) fn scan_html_block_inner(
if scan_ch(&data[i..], b'>') == 0 {
None
} else {
Some(i + 1)
i += 1;
if !buffer.is_empty() {
buffer.extend(&data[last_buf_index..i]);
}
Some((buffer, i))
}
}
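The buffer built by `scan_html_block_inner` above is what ends up in `ItemBody::OwnedHtml`: when inline HTML spans multiple blockquote lines, the skipped `> ` prefixes are dropped and the remaining bytes are stitched into an owned string, emitted as an `Event::Html`. A sketch of the observable effect, assuming the crate's public event API:

use pulldown_cmark::{Event, Parser};

fn main() {
    // Inline HTML split across blockquote lines (cf. regression test 66 below).
    let input = "> > a <a href\n> > =\"yo\n> > lo\">\n";
    for event in Parser::new(input) {
        if let Event::Html(html) = event {
            // The `> > ` prefixes are gone; only the tag text remains.
            println!("{:?}", html);
        }
    }
}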

@ -237,7 +237,7 @@ fn html_test_broken_callback() {
let mut s = String::new();
let callback = |broken_link: BrokenLink| {
let mut callback = |broken_link: BrokenLink| {
if broken_link.reference == "foo" || broken_link.reference == "baz" {
Some(("https://replaced.example.org".into(), "some title".into()))
} else {

@ -461,6 +461,21 @@ fn regression_test_33() {
#[test]
fn regression_test_34() {
let original = r##"> [foo
> bar]: /url
>
> [foo bar]
"##;
let expected = r##"<blockquote>
<p><a href="/url">foo bar</a></p>
</blockquote>
"##;
test_markdown_html(original, expected, false);
}
#[test]
fn regression_test_35() {
let original = r##"> foo | bar
> --- | ---
yolo | swag
@ -475,7 +490,7 @@ yolo | swag
}
#[test]
fn regression_test_35() {
fn regression_test_36() {
let original = r##"<foo bar>
"##;
let expected = r##"<foo bar>
@ -485,7 +500,7 @@ fn regression_test_35() {
}
#[test]
fn regression_test_36() {
fn regression_test_37() {
let original = r##"<foo bar =
"hi">
"##;
@ -497,7 +512,7 @@ fn regression_test_36() {
}
#[test]
fn regression_test_37() {
fn regression_test_38() {
let original = r##"~~*_**__
__a__
@ -510,7 +525,7 @@ __a__
}
#[test]
fn regression_test_38() {
fn regression_test_39() {
let original = r##"> `
> `
"##;
@ -523,7 +538,7 @@ fn regression_test_38() {
}
#[test]
fn regression_test_39() {
fn regression_test_40() {
let original = r##"`\|`
"##;
let expected = r##"<p><code>\|</code></p>
@ -533,7 +548,7 @@ fn regression_test_39() {
}
#[test]
fn regression_test_40() {
fn regression_test_41() {
let original = r##"Paragraph 1
Paragraph 2
@ -546,7 +561,7 @@ Paragraph 2
}
#[test]
fn regression_test_41() {
fn regression_test_42() {
let original = r##"\[[link text](https://www.google.com/)\]
"##;
let expected = r##"<p>[<a href="https://www.google.com/">link text</a>]</p>
@ -556,7 +571,7 @@ fn regression_test_41() {
}
#[test]
fn regression_test_42() {
fn regression_test_43() {
let original = r##"foo | bar
--- | ---
[a](< | url>)
@ -568,7 +583,7 @@ fn regression_test_42() {
}
#[test]
fn regression_test_43() {
fn regression_test_44() {
let original = r##"[a](url "
- - -
")
@ -582,7 +597,7 @@ fn regression_test_43() {
}
#[test]
fn regression_test_44() {
fn regression_test_45() {
let original = r##"[a](url
)
@ -595,7 +610,7 @@ fn regression_test_44() {
}
#[test]
fn regression_test_45() {
fn regression_test_46() {
let original = r##"[a](b "
")
@ -608,7 +623,7 @@ fn regression_test_45() {
}
#[test]
fn regression_test_46() {
fn regression_test_47() {
let original = r##"<http:// >
"##;
let expected = r##"<p>&lt;http:// &gt;</p>
@ -618,7 +633,7 @@ fn regression_test_46() {
}
#[test]
fn regression_test_47() {
fn regression_test_48() {
let original = r##"<http://>
"##;
let expected = r##"<p>&lt;http://&gt;</p>
@ -628,7 +643,7 @@ fn regression_test_47() {
}
#[test]
fn regression_test_48() {
fn regression_test_49() {
let original = r##"foo | bar
--- | ---
<http://| baz
@ -647,7 +662,7 @@ fn regression_test_48() {
}
#[test]
fn regression_test_49() {
fn regression_test_50() {
let original = r##"foo | bar
--- | ---
<http://|>
@ -666,7 +681,7 @@ fn regression_test_49() {
}
#[test]
fn regression_test_50() {
fn regression_test_51() {
let original = r##"<sup>\*hi</sup>\_
"##;
let expected = r##"<p><sup>*hi</sup>_</p>
@ -676,7 +691,7 @@ fn regression_test_50() {
}
#[test]
fn regression_test_51() {
fn regression_test_52() {
let original = r##"email: <john@example.com>\_
"##;
let expected = r##"<p>email: <a href="mailto:john@example.com">john@example.com</a>_</p>
@ -686,7 +701,7 @@ fn regression_test_51() {
}
#[test]
fn regression_test_52() {
fn regression_test_53() {
let original = r##"> [link](/url 'foo
> bar')
"##;
@ -700,7 +715,7 @@ bar">link</a></p>
}
#[test]
fn regression_test_53() {
fn regression_test_54() {
let original = r##"> [foo
> bar]: /url
>
@ -715,7 +730,7 @@ fn regression_test_53() {
}
#[test]
fn regression_test_54() {
fn regression_test_55() {
let original = r##"> [foo bar]: /url
>
> [foo
@ -731,7 +746,7 @@ bar</a></p>
}
#[test]
fn regression_test_55() {
fn regression_test_56() {
let original = r##"> - [a
> b c]: /foo
@ -749,7 +764,7 @@ fn regression_test_55() {
}
#[test]
fn regression_test_56() {
fn regression_test_57() {
let original = r##"[a
> b]: /foo
@ -766,7 +781,7 @@ fn regression_test_56() {
}
#[test]
fn regression_test_57() {
fn regression_test_58() {
let original = r##"[`cargo
package`]
@ -779,7 +794,7 @@ package`]
}
#[test]
fn regression_test_58() {
fn regression_test_59() {
let original = r##"> [`cargo
> package`]
@ -794,7 +809,7 @@ fn regression_test_58() {
}
#[test]
fn regression_test_59() {
fn regression_test_60() {
let original = r##"> `cargo
> package`
"##;
@ -806,7 +821,23 @@ fn regression_test_59() {
}
#[test]
fn regression_test_60() {
fn regression_test_61() {
let original = r##"> Note: Though you should not rely on this, all pointers to <abbr
> title="Dynamically Sized Types">DSTs</abbr> are currently twice the size of
> the size of `usize` and have the same alignment.
"##;
let expected = r##"<blockquote>
<p>Note: Though you should not rely on this, all pointers to
<abbr title="Dynamically Sized Types">DSTs</abbr> are currently twice the size of
the size of <code>usize</code> and have the same alignment.</p>
</blockquote>
"##;
test_markdown_html(original, expected, false);
}
#[test]
fn regression_test_62() {
let original = r##"Lorem ipsum.[^a]
An unordered list before the footnotes:
@ -830,7 +861,7 @@ An unordered list before the footnotes:
}
#[test]
fn regression_test_61() {
fn regression_test_63() {
let original = r##"[][a]
[a]: b
@ -849,7 +880,7 @@ fn regression_test_61() {
}
#[test]
fn regression_test_62() {
fn regression_test_64() {
let original = r##"* A list.
* A sublist.
@ -894,7 +925,7 @@ fn regression_test_62() {
}
#[test]
fn regression_test_63() {
fn regression_test_65() {
let original = r##"<foo
"##;
let expected = r##"<p>&lt;foo</p>
@ -902,3 +933,21 @@ fn regression_test_63() {
test_markdown_html(original, expected, false);
}
#[test]
fn regression_test_66() {
let original = r##"> > a <a href
> > ="yo
> > lo">
"##;
let expected = r##"<blockquote>
<blockquote>
<p>a <a href
="yo
lo"></p>
</blockquote>
</blockquote>
"##;
test_markdown_html(original, expected, false);
}
