where
S: Serializer,
{
let s = match self {
Self::Minimal => "minimal",
Self::Standard => "standard",
Self::Aggressive => "aggressive",
};
serializer.serialize_str(s)
}
}
}
#[cfg(all(test, any(feature = "serde", feature = "metadata")))]
mod tests {
    use super::*;

    /// Round-trips `PreprocessingOptions` through JSON and checks that the
    /// explicitly set fields survive unchanged.
    #[test]
    fn test_preprocessing_options_serde() {
        let original = PreprocessingOptions {
            enabled: true,
            preset: PreprocessingPreset::Aggressive,
            remove_navigation: false,
            ..Default::default()
        };

        // Encode, then decode the encoded form again.
        let encoded = serde_json::to_string(&original).expect("Failed to serialize");
        let decoded = serde_json::from_str::<PreprocessingOptions>(&encoded).expect("Failed to deserialize");

        // The three explicitly set fields must come back as written.
        assert!(decoded.enabled);
        assert_eq!(decoded.preset, PreprocessingPreset::Aggressive);
        assert!(!decoded.remove_navigation);
    }
}
================================================
FILE: crates/html-to-markdown/src/options/validation.rs
================================================
//! Validation and parsing utilities for option enums.
//!
//! This module provides parsing and serialization logic for configuration
//! enums (HeadingStyle, ListIndentType, etc.) with string conversion support.
/// Heading rendering style for Markdown output.
///
/// Selects the syntax used for headings (h1-h6) in the generated Markdown.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum HeadingStyle {
    /// Underlined style (=== for h1, --- for h2).
    Underlined,
    /// ATX style (# for h1, ## for h2, etc.). Default.
    #[default]
    Atx,
    /// ATX closed style (# title #, with closing hashes).
    AtxClosed,
}

impl HeadingStyle {
    /// Parse a heading style from a configuration string.
    ///
    /// The input is normalized first (lowercased, ASCII alphanumerics only).
    /// `"atx"` and `"atxclosed"` select the matching variants; any other
    /// token yields [`HeadingStyle::Underlined`]. Note that this fallback is
    /// not the `Default` variant (`Atx`).
    #[must_use]
    pub fn parse(value: &str) -> Self {
        let token = normalize_token(value);
        match token.as_str() {
            "atxclosed" => Self::AtxClosed,
            "atx" => Self::Atx,
            _ => Self::Underlined,
        }
    }
}
/// Character used to indent list items.
///
/// Chooses between space-based and tab-based indentation for lists.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ListIndentType {
    /// Indent with spaces. Default. Width controlled by `list_indent_width`.
    #[default]
    Spaces,
    /// Indent with tabs.
    Tabs,
}

impl ListIndentType {
    /// Parse a list indentation type from a configuration string.
    ///
    /// The input is normalized first (lowercased, ASCII alphanumerics only).
    /// `"tabs"` selects [`ListIndentType::Tabs`]; anything else yields
    /// [`ListIndentType::Spaces`].
    #[must_use]
    pub fn parse(value: &str) -> Self {
        if normalize_token(value) == "tabs" {
            Self::Tabs
        } else {
            Self::Spaces
        }
    }
}
/// Strategy for handling whitespace during conversion.
///
/// Decides how runs of whitespace characters (spaces, tabs, newlines) are processed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum WhitespaceMode {
    /// Collapse multiple whitespace characters to single spaces. Default. Matches browser behavior.
    #[default]
    Normalized,
    /// Preserve all whitespace exactly as it appears in the HTML.
    Strict,
}

impl WhitespaceMode {
    /// Parse a whitespace mode from a configuration string.
    ///
    /// The input is normalized first (lowercased, ASCII alphanumerics only).
    /// `"strict"` selects [`WhitespaceMode::Strict`]; anything else yields
    /// [`WhitespaceMode::Normalized`].
    #[must_use]
    pub fn parse(value: &str) -> Self {
        if normalize_token(value) == "strict" {
            Self::Strict
        } else {
            Self::Normalized
        }
    }
}
/// Line break syntax in Markdown output.
///
/// Controls how soft line breaks (from `<br>` or line breaks in source) are rendered.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum NewlineStyle {
    /// Two trailing spaces at end of line. Default. Standard Markdown syntax.
    #[default]
    Spaces,
    /// Backslash at end of line. Alternative Markdown syntax.
    Backslash,
}

impl NewlineStyle {
    /// Parse a newline style from a configuration string.
    ///
    /// The input is normalized first (lowercased, ASCII alphanumerics only).
    /// `"backslash"` selects [`NewlineStyle::Backslash`]; anything else
    /// yields [`NewlineStyle::Spaces`].
    #[must_use]
    pub fn parse(value: &str) -> Self {
        if normalize_token(value) == "backslash" {
            Self::Backslash
        } else {
            Self::Spaces
        }
    }
}
/// Code block fence style in Markdown output.
///
/// Determines how code blocks (e.g. `<pre><code>`) are rendered in Markdown.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum CodeBlockStyle {
    /// Indented code blocks (4 spaces). `CommonMark` standard.
    Indented,
    /// Fenced code blocks with backticks (```). Default (GFM). Supports language hints.
    #[default]
    Backticks,
    /// Fenced code blocks with tildes (~~~). Supports language hints.
    Tildes,
}

impl CodeBlockStyle {
    /// Parse a code block style from a configuration string.
    ///
    /// The input is normalized first (lowercased, ASCII alphanumerics only).
    /// `"backticks"` and `"tildes"` select the matching variants; any other
    /// token yields [`CodeBlockStyle::Indented`]. Note that this fallback is
    /// not the `Default` variant (`Backticks`).
    #[must_use]
    pub fn parse(value: &str) -> Self {
        let token = normalize_token(value);
        match token.as_str() {
            "tildes" => Self::Tildes,
            "backticks" => Self::Backticks,
            _ => Self::Indented,
        }
    }
}
/// Highlight rendering style for `<mark>` elements.
///
/// Controls how highlighted text is rendered in Markdown output.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum HighlightStyle {
    /// Double equals syntax (==text==). Default. Pandoc-compatible.
    #[default]
    DoubleEqual,
    /// Preserve as HTML (`<mark>text</mark>`). Original HTML tag.
    Html,
    /// Render as bold (**text**). Uses strong emphasis.
    Bold,
    /// Strip formatting, render as plain text. No markup.
    None,
}

impl HighlightStyle {
    /// Parse a highlight style from a configuration string.
    ///
    /// The input is normalized first (lowercased, ASCII alphanumerics only).
    /// `"doubleequal"`, `"html"` and `"bold"` select the matching variants;
    /// every other token, `"none"` included, yields [`HighlightStyle::None`].
    #[must_use]
    pub fn parse(value: &str) -> Self {
        let token = normalize_token(value);
        match token.as_str() {
            "bold" => Self::Bold,
            "html" => Self::Html,
            "doubleequal" => Self::DoubleEqual,
            // "none" and any unrecognized token share the same outcome.
            _ => Self::None,
        }
    }
}
/// Link rendering style in Markdown output.
///
/// Chooses between inline `[text](url)` syntax and reference-style
/// `[text][1]` syntax (with definitions collected at the end) for links and images.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LinkStyle {
    /// Inline links: `[text](url)`. Default.
    #[default]
    Inline,
    /// Reference-style links: `[text][1]` with `[1]: url` at end of document.
    Reference,
}

impl LinkStyle {
    /// Parse a link style from a configuration string.
    ///
    /// The input is normalized first (lowercased, ASCII alphanumerics only).
    /// `"reference"` selects [`LinkStyle::Reference`]; anything else yields
    /// [`LinkStyle::Inline`].
    #[must_use]
    pub fn parse(value: &str) -> Self {
        if normalize_token(value) == "reference" {
            Self::Reference
        } else {
            Self::Inline
        }
    }
}
/// Output format for conversion.
///
/// Names the target markup language the conversion produces.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum OutputFormat {
    /// Standard Markdown (CommonMark compatible). Default.
    #[default]
    Markdown,
    /// Djot lightweight markup language.
    Djot,
    /// Plain text output (no markup, visible text only).
    Plain,
}

impl OutputFormat {
    /// Parse an output format from a configuration string.
    ///
    /// The input is normalized first (lowercased, ASCII alphanumerics only).
    /// `"djot"` selects [`OutputFormat::Djot`]; `"plain"`, `"plaintext"` and
    /// `"text"` select [`OutputFormat::Plain`]; anything else yields
    /// [`OutputFormat::Markdown`].
    #[must_use]
    pub fn parse(value: &str) -> Self {
        let token = normalize_token(value);
        match token.as_str() {
            "plain" | "plaintext" | "text" => Self::Plain,
            "djot" => Self::Djot,
            _ => Self::Markdown,
        }
    }
}
/// Normalize a configuration string: keep only ASCII alphanumeric characters
/// and lowercase them.
///
/// Everything else (whitespace, punctuation, non-ASCII characters) is dropped,
/// so `"ATX-Closed"` becomes `"atxclosed"`.
pub(crate) fn normalize_token(value: &str) -> String {
    value
        .chars()
        .filter(char::is_ascii_alphanumeric)
        .map(|ch| ch.to_ascii_lowercase())
        .collect()
}
// Serde support for the option enums. Deserialization goes through each enum's
// `parse` function; serialization emits a fixed lowercase string per variant.
#[cfg(any(feature = "serde", feature = "metadata"))]
mod serde_impls {
use super::{
CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat,
WhitespaceMode,
};
use serde::{Deserialize, Serialize, Serializer};
// Implements `Deserialize` for `$ty` by reading a `String` and handing it to `$parser`.
// The parser's result is wrapped in `Ok` unconditionally, so an unrecognized string
// becomes whatever variant the parser falls back to instead of a deserialization error.
// Only a non-string input can fail (at the `String::deserialize` step).
macro_rules! impl_deserialize_from_parse {
($ty:ty, $parser:expr) => {
impl<'de> Deserialize<'de> for $ty {
fn deserialize(deserializer: D) -> Result
where
D: serde::Deserializer<'de>,
{
let value = String::deserialize(deserializer)?;
Ok($parser(&value))
}
}
};
}
impl_deserialize_from_parse!(HeadingStyle, HeadingStyle::parse);
impl_deserialize_from_parse!(ListIndentType, ListIndentType::parse);
impl_deserialize_from_parse!(WhitespaceMode, WhitespaceMode::parse);
impl_deserialize_from_parse!(NewlineStyle, NewlineStyle::parse);
impl_deserialize_from_parse!(CodeBlockStyle, CodeBlockStyle::parse);
impl_deserialize_from_parse!(HighlightStyle, HighlightStyle::parse);
impl_deserialize_from_parse!(LinkStyle, LinkStyle::parse);
impl_deserialize_from_parse!(OutputFormat, OutputFormat::parse);
// Serialize implementations that convert enum variants to their string representations.
// Each emitted string is one that the matching `parse` function maps back to the same variant.
impl Serialize for HeadingStyle {
// Writes "underlined", "atx" or "atxclosed".
fn serialize(&self, serializer: S) -> Result
where
S: Serializer,
{
let s = match self {
Self::Underlined => "underlined",
Self::Atx => "atx",
Self::AtxClosed => "atxclosed",
};
serializer.serialize_str(s)
}
}
impl Serialize for ListIndentType {
// Writes "spaces" or "tabs".
fn serialize(&self, serializer: S) -> Result
where
S: Serializer,
{
let s = match self {
Self::Spaces => "spaces",
Self::Tabs => "tabs",
};
serializer.serialize_str(s)
}
}
impl Serialize for WhitespaceMode {
// Writes "normalized" or "strict".
fn serialize(&self, serializer: S) -> Result
where
S: Serializer,
{
let s = match self {
Self::Normalized => "normalized",
Self::Strict => "strict",
};
serializer.serialize_str(s)
}
}
impl Serialize for NewlineStyle {
// Writes "spaces" or "backslash".
fn serialize(&self, serializer: S) -> Result
where
S: Serializer,
{
let s = match self {
Self::Spaces => "spaces",
Self::Backslash => "backslash",
};
serializer.serialize_str(s)
}
}
impl Serialize for CodeBlockStyle {
// Writes "indented", "backticks" or "tildes".
fn serialize(&self, serializer: S) -> Result
where
S: Serializer,
{
let s = match self {
Self::Indented => "indented",
Self::Backticks => "backticks",
Self::Tildes => "tildes",
};
serializer.serialize_str(s)
}
}
impl Serialize for HighlightStyle {
// Writes "doubleequal", "html", "bold" or "none".
fn serialize(&self, serializer: S) -> Result
where
S: Serializer,
{
let s = match self {
Self::DoubleEqual => "doubleequal",
Self::Html => "html",
Self::Bold => "bold",
Self::None => "none",
};
serializer.serialize_str(s)
}
}
impl Serialize for LinkStyle {
// Writes "inline" or "reference".
fn serialize(&self, serializer: S) -> Result
where
S: Serializer,
{
let s = match self {
Self::Inline => "inline",
Self::Reference => "reference",
};
serializer.serialize_str(s)
}
}
impl Serialize for OutputFormat {
// Writes "markdown", "djot" or "plain".
fn serialize(&self, serializer: S) -> Result
where
S: Serializer,
{
let s = match self {
Self::Markdown => "markdown",
Self::Djot => "djot",
Self::Plain => "plain",
};
serializer.serialize_str(s)
}
}
}
#[cfg(all(test, any(feature = "serde", feature = "metadata")))]
mod tests {
    use super::*;

    /// Enum variants serialize to their lowercase string tokens.
    #[test]
    fn test_enum_serialization() {
        let encoded = serde_json::to_string(&HeadingStyle::AtxClosed).expect("Failed to serialize");
        assert_eq!(encoded, r#""atxclosed""#);

        let encoded = serde_json::to_string(&ListIndentType::Tabs).expect("Failed to serialize");
        assert_eq!(encoded, r#""tabs""#);

        let encoded = serde_json::to_string(&WhitespaceMode::Strict).expect("Failed to serialize");
        assert_eq!(encoded, r#""strict""#);
    }

    /// Enum variants deserialize from strings regardless of letter case.
    #[test]
    fn test_enum_deserialization() {
        for raw in [r#""atxclosed""#, r#""ATXCLOSED""#] {
            let heading = serde_json::from_str::<HeadingStyle>(raw).expect("Failed");
            assert_eq!(heading, HeadingStyle::AtxClosed);
        }

        let list_indent = serde_json::from_str::<ListIndentType>(r#""tabs""#).expect("Failed");
        assert_eq!(list_indent, ListIndentType::Tabs);
    }
}
================================================
FILE: crates/html-to-markdown/src/prelude.rs
================================================
//! Prelude module for convenient internal imports.
================================================
FILE: crates/html-to-markdown/src/rcdom.rs
================================================
// Vendored from markup5ever_rcdom v0.36.0+unofficial
// Original source: https://github.com/servo/html5ever (rcdom/)
// Copyright (c) 2014 The html5ever Project Developers
// Licensed under MIT OR Apache-2.0 (see ATTRIBUTIONS.md)
//
// Vendored to:
// - Remove unused xml5ever transitive dependency
// - Eliminate pinned external dependency on "+unofficial" crate
// - Gain full control over this small, critical module
//
// Changes from upstream:
// - Replaced `extern crate markup5ever` / `extern crate tendril` with
// `use` imports through `html5ever` (edition 2024 compatibility)
// - Added module-level clippy allows for vendored code style
#![allow(
clippy::panic,
clippy::expect_used,
clippy::missing_panics_doc,
clippy::must_use_candidate,
clippy::return_self_not_must_use,
clippy::module_name_repetitions,
clippy::redundant_else,
clippy::match_wildcard_for_single_variants,
clippy::similar_names,
clippy::items_after_statements,
clippy::use_self,
clippy::missing_fields_in_debug,
clippy::semicolon_if_nothing_returned,
missing_docs
)]
//! A simple reference-counted DOM.
//!
//! This is sufficient as a static parse tree, but don't build a
//! web browser using it. :)
use std::borrow::Cow;
use std::cell::{Cell, RefCell};
use std::collections::{HashSet, VecDeque};
use std::default::Default;
use std::fmt;
use std::io;
use std::mem;
use std::rc::{Rc, Weak};
use html5ever::tendril::StrTendril;
use html5ever::Attribute;
use html5ever::ExpandedName;
use html5ever::QualName;
use html5ever::interface::tree_builder;
use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
use html5ever::serialize::TraversalScope;
use html5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
use html5ever::serialize::{Serialize, Serializer};
/// The different kinds of nodes in the DOM.
///
/// Each variant carries the payload specific to that node kind; the tree
/// links (parent/children) live on [`Node`], not here.
#[derive(Debug)]
pub enum NodeData {
/// The `Document` itself - the root node of a HTML document.
Document,
/// A `DOCTYPE` with name, public id, and system id. See
/// [document type declaration on wikipedia][dtd wiki].
///
/// [dtd wiki]: https://en.wikipedia.org/wiki/Document_type_declaration
Doctype {
name: StrTendril,
// Fields required by html5ever's DOM model; not accessed during conversion.
#[allow(dead_code)]
public_id: StrTendril,
#[allow(dead_code)]
system_id: StrTendril,
},
/// A text node. The contents sit behind a `RefCell` so that adjacent text
/// can be appended to an existing node through a shared handle.
Text { contents: RefCell },
/// A comment.
Comment { contents: StrTendril },
/// An element with attributes.
Element {
name: QualName,
attrs: RefCell>,
/// For HTML `<template>` elements, the [template contents].
///
/// [template contents]: https://html.spec.whatwg.org/multipage/#template-contents
template_contents: RefCell>,
/// Whether the node is a [HTML integration point].
///
/// [HTML integration point]: https://html.spec.whatwg.org/multipage/#html-integration-point
mathml_annotation_xml_integration_point: bool,
},
/// A Processing instruction.
ProcessingInstruction { target: StrTendril, contents: StrTendril },
}
/// A DOM node.
///
/// Holds the node's own data plus its tree links. Both links use interior
/// mutability (`Cell` / `RefCell`) so the tree can be rewired through shared handles.
pub struct Node {
/// Parent node.
pub parent: Cell >,
/// Child nodes of this node.
pub children: RefCell>,
/// Represents this node's data.
pub data: NodeData,
}
impl Node {
/// Create a new node from its contents.
///
/// The node starts detached: no parent and an empty child list.
pub fn new(data: NodeData) -> Rc {
Rc::new(Node {
data,
parent: Cell::new(None),
children: RefCell::new(Vec::new()),
})
}
}
impl Drop for Node {
    /// Tears down the subtree with an explicit work list instead of relying on
    /// nested drops: each descendant's children (and template contents, for
    /// elements) are moved onto the list before that descendant is released.
    fn drop(&mut self) {
        let mut pending = mem::take(&mut *self.children.borrow_mut());
        while let Some(current) = pending.pop() {
            // Detach the grandchildren first so `current` is childless when it is dropped.
            pending.extend(mem::take(&mut *current.children.borrow_mut()));

            if let NodeData::Element { template_contents, .. } = &current.data {
                let detached = template_contents.borrow_mut().take();
                if let Some(contents) = detached {
                    pending.push(contents);
                }
            }
        }
    }
}
impl fmt::Debug for Node {
    /// Formats the node as a `Node` struct showing `data` and `children`;
    /// the parent link is left out of the output.
    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
        let mut builder = fmt.debug_struct("Node");
        builder.field("data", &self.data);
        builder.field("children", &self.children);
        builder.finish()
    }
}
/// Reference to a DOM node (reference-counted, shared ownership).
pub type Handle = Rc;
/// Weak reference to a DOM node, used for parent pointers so that a child
/// does not keep its parent alive.
pub type WeakHandle = Weak;
/// Append a parentless node to another node's children.
///
/// # Panics
///
/// Panics if `child` already has a parent.
fn append(new_parent: &Handle, child: Handle) {
    let weak_parent = Rc::downgrade(new_parent);
    let previous_parent = child.parent.replace(Some(weak_parent));
    // Invariant: the child must have been detached before this call.
    assert!(previous_parent.is_none());
    let mut siblings = new_parent.children.borrow_mut();
    siblings.push(child);
}
/// Look up a node's parent together with the node's index among that parent's children.
///
/// Returns `None` when the node has no parent.
///
/// # Panics
///
/// Panics if the parent pointer is dangling, or if the parent's child list
/// does not contain `target`.
fn get_parent_and_index(target: &Handle) -> Option<(Handle, usize)> {
    // `Cell` offers no borrowing access, so take the pointer out and put it back below.
    let weak = target.parent.take()?;
    let parent = weak.upgrade().expect("dangling weak pointer");
    target.parent.set(Some(weak));

    let position = parent
        .children
        .borrow()
        .iter()
        .position(|child| Rc::ptr_eq(child, target));
    match position {
        Some(index) => Some((parent, index)),
        None => panic!("have parent but couldn't find in parent's children!"),
    }
}
/// Append `text` to `prev` if `prev` is a text node.
///
/// Returns `true` when the text was appended, `false` when `prev` is any
/// other kind of node (in which case nothing is modified).
fn append_to_existing_text(prev: &Handle, text: &str) -> bool {
    let NodeData::Text { contents } = &prev.data else {
        return false;
    };
    contents.borrow_mut().push_slice(text);
    true
}
/// Detach `target` from its parent, if it has one.
///
/// Removes the node from the parent's child list and clears the node's
/// parent pointer. Does nothing for a node without a parent.
fn remove_from_parent(target: &Handle) {
    let Some((parent, index)) = get_parent_and_index(target) else {
        return;
    };
    parent.children.borrow_mut().remove(index);
    target.parent.set(None);
}
/// The DOM itself; the result of parsing.
///
/// Acts as the `TreeSink` that receives tree-construction callbacks; the
/// mutable parts use interior mutability because those callbacks take `&self`.
pub struct RcDom {
/// The `Document` itself.
pub document: Handle,
/// Errors that occurred during parsing.
pub errors: RefCell>>,
/// The document's quirks mode.
pub quirks_mode: Cell,
}
// `TreeSink` implementation: the tree builder drives these callbacks to
// construct and rearrange the reference-counted DOM.
impl TreeSink for RcDom {
type Output = Self;
/// Parsing is done; the sink itself is the output.
fn finish(self) -> Self {
self
}
type Handle = Handle;
type ElemName<'a>
= ExpandedName<'a>
where
Self: 'a;
/// Record a parse error message in `errors`.
fn parse_error(&self, msg: Cow<'static, str>) {
self.errors.borrow_mut().push(msg);
}
/// Return a handle to the document root.
fn get_document(&self) -> Handle {
self.document.clone()
}
/// Return the template contents of `target`.
///
/// Panics if `target` is not an element or has no template contents.
fn get_template_contents(&self, target: &Handle) -> Handle {
if let NodeData::Element {
ref template_contents, ..
} = target.data
{
template_contents
.borrow()
.as_ref()
.expect("not a template element!")
.clone()
} else {
panic!("not a template element!")
}
}
/// Store the document's quirks mode.
fn set_quirks_mode(&self, mode: QuirksMode) {
self.quirks_mode.set(mode);
}
/// Two handles are the same node when they point at the same allocation.
fn same_node(&self, x: &Handle, y: &Handle) -> bool {
Rc::ptr_eq(x, y)
}
/// Return the expanded name of an element; panics for any other node kind.
fn elem_name<'a>(&self, target: &'a Handle) -> ExpandedName<'a> {
match target.data {
NodeData::Element { ref name, .. } => name.expanded(),
_ => panic!("not an element!"),
}
}
/// Create a detached element node. When `flags.template` is set, the element
/// gets a fresh `Document` node as its template contents.
fn create_element(&self, name: QualName, attrs: Vec, flags: ElementFlags) -> Handle {
Node::new(NodeData::Element {
name,
attrs: RefCell::new(attrs),
template_contents: RefCell::new(if flags.template {
Some(Node::new(NodeData::Document))
} else {
None
}),
mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point,
})
}
/// Create a detached comment node.
fn create_comment(&self, text: StrTendril) -> Handle {
Node::new(NodeData::Comment { contents: text })
}
/// Create a detached processing-instruction node.
fn create_pi(&self, target: StrTendril, data: StrTendril) -> Handle {
Node::new(NodeData::ProcessingInstruction { target, contents: data })
}
/// Append a node or text as the last child of `parent`. Text is merged into
/// the current last child when that child is itself a text node.
fn append(&self, parent: &Handle, child: NodeOrText) {
// Append to an existing Text node if we have one.
if let NodeOrText::AppendText(text) = &child {
if let Some(h) = parent.children.borrow().last() {
if append_to_existing_text(h, text) {
return;
}
}
}
append(
parent,
match child {
NodeOrText::AppendText(text) => Node::new(NodeData::Text {
contents: RefCell::new(text),
}),
NodeOrText::AppendNode(node) => node,
},
);
}
/// Insert a node or text immediately before `sibling`. Text is merged into the
/// preceding sibling when that sibling is a text node.
///
/// Panics if `sibling` has no parent.
fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText) {
let (parent, i) = get_parent_and_index(sibling).expect("append_before_sibling called on node without parent");
let child = match (child, i) {
// No previous node.
(NodeOrText::AppendText(text), 0) => Node::new(NodeData::Text {
contents: RefCell::new(text),
}),
// Look for a text node before the insertion point.
(NodeOrText::AppendText(text), i) => {
let children = parent.children.borrow();
let prev = &children[i - 1];
if append_to_existing_text(prev, &text) {
return;
}
Node::new(NodeData::Text {
contents: RefCell::new(text),
})
}
// The tree builder promises we won't have a text node after
// the insertion point.
// Any other kind of node.
(NodeOrText::AppendNode(node), _) => node,
};
// The node may still be attached elsewhere; detach it before re-inserting.
remove_from_parent(&child);
child.parent.set(Some(Rc::downgrade(&parent)));
parent.children.borrow_mut().insert(i, child);
}
/// If `element` has a parent, insert `child` before `element`; otherwise
/// append `child` to `prev_element`.
fn append_based_on_parent_node(
&self,
element: &Self::Handle,
prev_element: &Self::Handle,
child: NodeOrText,
) {
// Take-and-restore: peek at the parent pointer held in the `Cell` without changing it.
let parent = element.parent.take();
let has_parent = parent.is_some();
element.parent.set(parent);
if has_parent {
self.append_before_sibling(element, child);
} else {
self.append(prev_element, child);
}
}
/// Append a doctype node to the document root.
fn append_doctype_to_document(&self, name: StrTendril, public_id: StrTendril, system_id: StrTendril) {
append(
&self.document,
Node::new(NodeData::Doctype {
name,
public_id,
system_id,
}),
);
}
/// Add each attribute in `attrs` whose name is not already present on `target`.
///
/// Panics if `target` is not an element.
fn add_attrs_if_missing(&self, target: &Handle, attrs: Vec) {
let mut existing = if let NodeData::Element { ref attrs, .. } = target.data {
attrs.borrow_mut()
} else {
panic!("not an element")
};
// Snapshot the existing names so the filter below does not read `existing` while it is extended.
let existing_names = existing.iter().map(|e| e.name.clone()).collect::>();
existing.extend(attrs.into_iter().filter(|attr| !existing_names.contains(&attr.name)));
}
/// Detach `target` from its parent, if any.
fn remove_from_parent(&self, target: &Handle) {
remove_from_parent(target);
}
/// Move all children of `node` to the end of `new_parent`'s child list.
///
/// Panics if a child has no parent, a dangling parent, or a parent other than `node`.
fn reparent_children(&self, node: &Handle, new_parent: &Handle) {
let mut children = node.children.borrow_mut();
let mut new_children = new_parent.children.borrow_mut();
for child in children.iter() {
let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent)));
assert!(Rc::ptr_eq(
node,
&previous_parent
.expect("invariant: child must have a parent during reparenting")
.upgrade()
.expect("dangling weak")
))
}
new_children.extend(mem::take(&mut *children));
}
/// Return the element's integration-point flag; panics for non-elements.
fn is_mathml_annotation_xml_integration_point(&self, target: &Handle) -> bool {
if let NodeData::Element {
mathml_annotation_xml_integration_point,
..
} = target.data
{
mathml_annotation_xml_integration_point
} else {
panic!("not an element!")
}
}
}
impl Default for RcDom {
    /// Build an empty DOM: a bare `Document` root, no recorded errors, and
    /// quirks mode set to `NoQuirks`.
    fn default() -> Self {
        let document = Node::new(NodeData::Document);
        let quirks_mode = Cell::new(tree_builder::NoQuirks);
        Self {
            document,
            errors: Default::default(),
            quirks_mode,
        }
    }
}
/// A pending step in the serialization work queue.
enum SerializeOp {
/// Serialize the given node (and queue its children, for elements).
Open(Handle),
/// Emit the end tag for an element with the given name.
Close(QualName),
}
/// Wrapper around a [`Handle`] that carries the `Serialize` implementation.
pub struct SerializableHandle(Handle);
impl From for SerializableHandle {
/// Wrap a handle so it can be serialized.
fn from(h: Handle) -> SerializableHandle {
SerializableHandle(h)
}
}
impl Serialize for SerializableHandle {
/// Serialize the wrapped node (`IncludeNode`) or only its children (`ChildrenOnly`).
///
/// Traversal uses an explicit queue of open/close operations rather than
/// recursion. Panics if asked to serialize a `Document` node itself.
fn serialize(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()>
where
S: Serializer,
{
let mut ops = VecDeque::new();
match traversal_scope {
IncludeNode => ops.push_back(SerializeOp::Open(self.0.clone())),
ChildrenOnly(_) => ops.extend(self.0.children.borrow().iter().map(|h| SerializeOp::Open(h.clone()))),
}
while let Some(op) = ops.pop_front() {
match op {
SerializeOp::Open(handle) => match handle.data {
NodeData::Element {
ref name, ref attrs, ..
} => {
serializer
.start_elem(name.clone(), attrs.borrow().iter().map(|at| (&at.name, &at.value[..])))?;
ops.reserve(1 + handle.children.borrow().len());
// Queue the end tag first, then push the children in front of it in
// reverse, so they pop in document order followed by the end tag.
ops.push_front(SerializeOp::Close(name.clone()));
for child in handle.children.borrow().iter().rev() {
ops.push_front(SerializeOp::Open(child.clone()));
}
}
NodeData::Doctype { ref name, .. } => serializer.write_doctype(name)?,
NodeData::Text { ref contents } => serializer.write_text(&contents.borrow())?,
NodeData::Comment { ref contents } => serializer.write_comment(contents)?,
NodeData::ProcessingInstruction {
ref target,
ref contents,
} => serializer.write_processing_instruction(target, contents)?,
NodeData::Document => panic!("Can't serialize Document node itself"),
},
SerializeOp::Close(name) => {
serializer.end_elem(name)?;
}
}
}
Ok(())
}
}
================================================
FILE: crates/html-to-markdown/src/text.rs
================================================
#![allow(clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::unused_self)]
//! Text processing utilities for Markdown conversion.
use regex::Regex;
use std::borrow::Cow;
use std::sync::LazyLock;
/// Regex for escaping miscellaneous characters.
///
/// Captures one of: `\` `&` `<` `` ` `` `[` `]` `>` `~` `#` `=` `+` `|` `-`.
static ESCAPE_MISC_RE: LazyLock =
LazyLock::new(|| Regex::new(r"([\\&<`\[\]>~#=+|\-])").expect("valid regex pattern"));
/// Regex for escaping numbered lists.
///
/// Captures an ASCII digit (group 1) followed by `.` or `)` (group 2), anywhere in the text.
static ESCAPE_NUMBERED_LIST_RE: LazyLock =
LazyLock::new(|| Regex::new(r"([0-9])([.)])").expect("valid regex pattern"));
/// Regex for escaping ASCII punctuation (CommonMark spec example 12)
/// Matches: `! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ \` { | } ~`
static ESCAPE_ASCII_RE: LazyLock =
LazyLock::new(|| Regex::new(r"([!\x22#$%&\x27()*+,\-./:;<=>?@\[\\\]^_`{|}~])").expect("valid regex pattern"));
/// Escape Markdown special characters in text.
///
/// When `escape_ascii` is set it takes precedence: every ASCII punctuation
/// character is backslash-escaped and the other three flags are not consulted.
/// Otherwise the enabled escapes are applied in order: miscellaneous
/// characters (plus the `.`/`)` after a digit), then asterisks, then underscores.
///
/// The input is returned borrowed whenever nothing needs escaping.
///
/// # Arguments
///
/// * `text` - Text to escape
/// * `escape_misc` - Escape miscellaneous characters
///   (`\` `&` `<` `` ` `` `[` `]` `>` `~` `#` `=` `+` `|` `-`) and numbered-list markers
/// * `escape_asterisks` - Escape asterisks (`*`)
/// * `escape_underscores` - Escape underscores (`_`)
/// * `escape_ascii` - Escape all ASCII punctuation (for `CommonMark` spec compliance)
///
/// # Returns
///
/// Escaped text
#[allow(clippy::fn_params_excessive_bools)]
pub fn escape(
    text: &str,
    escape_misc: bool,
    escape_asterisks: bool,
    escape_underscores: bool,
    escape_ascii: bool,
) -> Cow<'_, str> {
    if text.is_empty() {
        return Cow::Borrowed("");
    }
    if !(escape_misc || escape_asterisks || escape_underscores || escape_ascii) {
        return Cow::Borrowed(text);
    }

    if escape_ascii {
        // Cheap byte scan first: skip the regex entirely when there is no punctuation.
        if !text.bytes().any(|byte| byte.is_ascii_punctuation()) {
            return Cow::Borrowed(text);
        }
        return ESCAPE_ASCII_RE.replace_all(text, r"\$1");
    }

    if escape_misc && !escape_asterisks && !escape_underscores {
        // Misc-only mode: if no byte could match either misc regex, nothing will change.
        let has_candidate = text.bytes().any(|byte| {
            matches!(
                byte,
                b'\\'
                    | b'&'
                    | b'<'
                    | b'`'
                    | b'['
                    | b']'
                    | b'>'
                    | b'~'
                    | b'#'
                    | b'='
                    | b'+'
                    | b'|'
                    | b'-'
                    | b'.'
                    | b')'
            )
        });
        if !has_candidate {
            return Cow::Borrowed(text);
        }
    }

    let mut escaped: Cow<'_, str> = Cow::Borrowed(text);

    if escape_misc {
        // Only replace `escaped` when a regex actually changed something.
        if let Cow::Owned(replaced) = ESCAPE_MISC_RE.replace_all(escaped.as_ref(), r"\$1") {
            escaped = Cow::Owned(replaced);
        }
        if let Cow::Owned(replaced) = ESCAPE_NUMBERED_LIST_RE.replace_all(escaped.as_ref(), r"$1\$2") {
            escaped = Cow::Owned(replaced);
        }
    }

    if escape_asterisks && escaped.contains('*') {
        escaped = Cow::Owned(escaped.replace('*', r"\*"));
    }

    if escape_underscores && escaped.contains('_') {
        escaped = Cow::Owned(escaped.replace('_', r"\_"));
    }

    escaped
}
/// Extract boundary whitespace from text (chomp).
///
/// Returns a `(prefix, suffix, trimmed_text)` tuple:
///
/// * `prefix` is `" "` when the text starts with any whitespace, otherwise `""`.
/// * `suffix` is `"\n\n"` when the text ends with a blank line (`"\n\n"` or
///   `"\r\n\r\n"`), `" "` when it ends with a space or tab, and `""` otherwise.
///   In particular a single trailing newline does not become a trailing space.
/// * `trimmed_text` is the input with all leading and trailing whitespace removed.
///
/// An empty input yields `("", "", "")`.
#[must_use]
pub fn chomp(text: &str) -> (&str, &str, &str) {
    let prefix = if text.starts_with(char::is_whitespace) {
        " "
    } else {
        ""
    };

    let suffix = if text.ends_with("\n\n") || text.ends_with("\r\n\r\n") {
        "\n\n"
    } else if text.ends_with([' ', '\t']) {
        " "
    } else {
        ""
    };

    // `trim` already strips every leading/trailing whitespace character,
    // newlines and carriage returns included, so no suffix-specific
    // pre-trimming is needed.
    (prefix, suffix, text.trim())
}
/// Collapse runs of spaces, tabs and Unicode space characters into a single
/// ASCII space.
///
/// Any other character, newlines included, is copied through unchanged and
/// ends the current run.
///
/// # Arguments
///
/// * `text` - The text to normalize
///
/// # Returns
///
/// Normalized text with collapsed spaces/tabs but preserved newlines
#[must_use]
pub fn normalize_whitespace(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    let mut in_space_run = false;

    for ch in text.chars() {
        let collapsible = matches!(ch, ' ' | '\t') || is_unicode_space(ch);
        if !collapsible {
            normalized.push(ch);
        } else if !in_space_run {
            // Only the first character of a run emits a space.
            normalized.push(' ');
        }
        in_space_run = collapsible;
    }

    normalized
}
/// Normalize whitespace in text, borrowing when the text is already normalized.
///
/// The text is scanned once. If it contains two consecutive space-like
/// characters, or any space-like character other than a plain ASCII space
/// (a tab or a Unicode space), the result of [`normalize_whitespace`] is
/// returned as an owned string. Otherwise the input is returned borrowed,
/// without allocating. Newlines are not space-like here and are preserved as-is.
///
/// # Arguments
///
/// * `text` - The text to normalize
///
/// # Returns
///
/// `Cow::Borrowed` if text is already normalized, or `Cow::Owned` with normalized text
#[must_use]
pub fn normalize_whitespace_cow(text: &str) -> Cow<'_, str> {
    let mut previous_was_space = false;
    let needs_rewrite = text.chars().any(|ch| {
        let space_like = matches!(ch, ' ' | '\t') || is_unicode_space(ch);
        let violates = space_like && (previous_was_space || ch != ' ');
        previous_was_space = space_like;
        violates
    });

    if needs_rewrite {
        Cow::Owned(normalize_whitespace(text))
    } else {
        Cow::Borrowed(text)
    }
}
/// Decode common HTML entities.
///
/// Delegates to `html_escape::decode_html_entities` and always returns an
/// owned `String`. Examples of entities and their character equivalents:
/// - `&quot;` → `"`
/// - `&apos;` → `'`
/// - `&lt;` → `<`
/// - `&gt;` → `>`
/// - `&amp;` → `&`
///
/// # Arguments
///
/// * `text` - Text containing HTML entities
///
/// # Returns
///
/// Text with entities decoded
#[must_use]
pub fn decode_html_entities(text: &str) -> String {
html_escape::decode_html_entities(text).into_owned()
}
/// Decode HTML entities in text, returning borrowed or owned result as needed.
///
/// Text without any `&` cannot contain an entity and is returned borrowed
/// without further work. Otherwise decoding is delegated to
/// `html_escape::decode_html_entities`, whose result is returned as-is.
///
/// Examples of entities and their character equivalents:
/// - `&quot;` → `"`
/// - `&apos;` → `'`
/// - `&lt;` → `<`
/// - `&gt;` → `>`
/// - `&amp;` → `&`
///
/// # Arguments
///
/// * `text` - Text potentially containing HTML entities
///
/// # Returns
///
/// `Cow::Borrowed` if no entities found, or `Cow::Owned` with entities decoded
#[must_use]
pub fn decode_html_entities_cow(text: &str) -> Cow<'_, str> {
// Fast path: no ampersand means there is nothing to decode.
if !text.contains('&') {
return Cow::Borrowed(text);
}
html_escape::decode_html_entities(text)
}
/// Check whether a character is one of the recognized Unicode space characters.
///
/// Covers U+00A0 (no-break space), U+1680, the contiguous block
/// U+2000 through U+200A, U+202F, U+205F and U+3000. ASCII space, tab and
/// newline are not included.
const fn is_unicode_space(ch: char) -> bool {
    matches!(
        ch,
        '\u{00A0}' | '\u{1680}' | '\u{2000}'..='\u{200A}' | '\u{202F}' | '\u{205F}' | '\u{3000}'
    )
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Misc mode escapes the listed special characters and numbered-list markers.
    #[test]
    fn test_escape_misc() {
        let cases = [
            ("foo & bar", r"foo \& bar"),
            ("foo [bar]", r"foo \[bar\]"),
            ("1. Item", r"1\. Item"),
            ("1) Item", r"1\) Item"),
        ];
        for (input, expected) in cases {
            assert_eq!(escape(input, true, false, false, false), expected);
        }
    }

    /// Asterisk mode escapes every `*`.
    #[test]
    fn test_escape_asterisks() {
        let cases = [("foo * bar", r"foo \* bar"), ("**bold**", r"\*\*bold\*\*")];
        for (input, expected) in cases {
            assert_eq!(escape(input, false, true, false, false), expected);
        }
    }

    /// Underscore mode escapes every `_`.
    #[test]
    fn test_escape_underscores() {
        let cases = [("foo_bar", r"foo\_bar"), ("__bold__", r"\_\_bold\_\_")];
        for (input, expected) in cases {
            assert_eq!(escape(input, false, false, true, false), expected);
        }
    }

    /// ASCII mode escapes all ASCII punctuation.
    #[test]
    fn test_escape_ascii() {
        let cases = [
            (r##"!"#$%&"##, r#"\!\"\#\$\%\&"#),
            ("*+,-./", r"\*\+\,\-\.\/"),
            ("<=>?@", r"\<\=\>\?\@"),
            (r"[\]^_`", r"\[\\\]\^\_\`"),
            ("{|}~", r"\{\|\}\~"),
        ];
        for (input, expected) in cases {
            assert_eq!(escape(input, false, false, false, true), expected);
        }
    }

    /// `chomp` reports boundary whitespace and returns the trimmed text.
    #[test]
    fn test_chomp() {
        let cases = [
            (" text ", (" ", " ", "text")),
            ("text", ("", "", "text")),
            (" text", (" ", "", "text")),
            ("text ", ("", " ", "text")),
            ("", ("", "", "")),
        ];
        for (input, expected) in cases {
            assert_eq!(chomp(input), expected);
        }
    }
}
================================================
FILE: crates/html-to-markdown/src/types/document.rs
================================================
//! Structured document tree types aligned with kreuzberg's `DocumentStructure`.
use std::collections::HashMap;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use super::tables::TableGrid;
/// A structured document tree representing the semantic content of an HTML document.
///
/// Uses a flat node array with index-based parent/child references for efficient traversal.
///
/// With the `serde` feature enabled the type additionally derives `Serialize` and
/// `Deserialize`.
// NOTE(review): `Vec` and `Option` below carry no type arguments as written, which will
// not compile; presumably `Vec<DocumentNode>` and `Option<String>` — confirm.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct DocumentStructure {
/// All nodes in document reading order.
pub nodes: Vec,
/// The source format (always "html" for this crate).
///
/// Left out of serialized output when it is `None`.
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
pub source_format: Option,
}
/// A single node in the document tree.
///
/// With the `serde` feature enabled the type additionally derives `Serialize` and
/// `Deserialize`; optional and empty fields are skipped on output as noted per field.
// NOTE(review): several field types below have lost their type arguments (`Option`, `Vec`,
// and the malformed `Option>`), so the struct will not compile as written — confirm the
// intended element types.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct DocumentNode {
/// Deterministic node identifier.
pub id: String,
/// The semantic content of this node.
pub content: NodeContent,
/// Index of the parent node (None for root nodes).
///
/// Left out of serialized output when it is `None`.
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
pub parent: Option,
/// Indices of child nodes in reading order.
///
/// Skipped when empty on serialization; defaults when absent on deserialization.
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Vec::is_empty", default))]
pub children: Vec,
/// Inline formatting annotations (bold, italic, links, etc.) with byte offsets into the text.
///
/// Skipped when empty on serialization; defaults when absent on deserialization.
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Vec::is_empty", default))]
pub annotations: Vec,
/// Format-specific attributes (e.g. class, id, data-* attributes).
///
/// Left out of serialized output when it is `None`.
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
pub attributes: Option>,
}
/// The semantic content type of a document node.
///
/// Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "serde", serde(tag = "node_type", rename_all = "snake_case"))]
pub enum NodeContent {
/// A heading element (h1-h6).
Heading {
/// Heading level (1-6).
level: u8,
/// The heading text content.
text: String,
},
/// A paragraph of text.
Paragraph {
/// The paragraph text content.
text: String,
},
/// A list container (ordered or unordered). Children are `ListItem` nodes.
List {
/// Whether this is an ordered list.
ordered: bool,
},
/// A single list item.
ListItem {
/// The list item text content.
text: String,
},
/// A table with structured cell data.
Table {
/// The table grid structure.
grid: TableGrid,
},
/// An image element.
Image {
/// Alt text or caption.
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
description: Option,
/// Image source URL.
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
src: Option,
/// Index into `ConversionResult.images` when image extraction is enabled.
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
image_index: Option,
},
/// A code block or inline code.
Code {
/// The code text content.
text: String,
/// Programming language (from class="language-*" or similar).
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
language: Option,
},
/// A block quote container.
Quote,
/// A definition list container.
DefinitionList,
/// A definition list entry with term and description.
DefinitionItem {
/// The term being defined.
term: String,
/// The definition text.
definition: String,
},
/// A raw block preserved as-is (e.g. `
Title
Hello
"#;
let result = html_to_markdown_rs::convert(html, None).expect("convert failed");
let metadata = result.metadata;
assert_eq!(metadata.structured_data.len(), 1);
assert!(metadata.structured_data[0].raw_json.contains(r#""@type": "Article""#));
assert_eq!(metadata.structured_data[0].schema_type.as_deref(), Some("Article"));
}
/// Exactly one structured-data entry is expected, with non-blank raw JSON and
/// a schema type of `Article`.
#[test]
fn extracts_json_ld_from_body_script_and_keeps_content() {
    let html = r#"
Title
"#;
    let metadata = html_to_markdown_rs::convert(html, None)
        .expect("convert failed")
        .metadata;
    let entries = &metadata.structured_data;
    assert_eq!(entries.len(), 1);
    let entry = &entries[0];
    assert!(!entry.raw_json.trim().is_empty());
    assert_eq!(entry.schema_type.as_deref(), Some("Article"));
}
================================================
FILE: crates/html-to-markdown/tests/lists_test.rs
================================================
#![allow(missing_docs)]
fn convert(
html: &str,
opts: Option,
) -> html_to_markdown_rs::error::Result {
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
}
use html_to_markdown_rs::ConversionOptions;
/// Expects a `- ` bullet line for each of the three items.
// NOTE(review): the `html` fixture is empty as written, so the expected text cannot
// appear; confirm the markup was not lost.
#[test]
fn test_basic_unordered_list() {
    let html = r"";
    let markdown = convert(html, None).unwrap();
    for bullet in ["- Item 1", "- Item 2", "- Item 3"] {
        assert!(markdown.contains(bullet));
    }
}
/// Expects the three entries to be numbered `1.`, `2.` and `3.`.
#[test]
fn test_basic_ordered_list() {
    let html = r"
First
Second
Third
";
    let markdown = convert(html, None).unwrap();
    for numbered in ["1. First", "2. Second", "3. Third"] {
        assert!(markdown.contains(numbered));
    }
}
/// Expects `-` bullets for the outer items and `*` bullets for the nested ones.
// NOTE(review): the `html` fixture is empty as written, so the expected text cannot
// appear; confirm the markup was not lost.
#[test]
fn test_nested_lists() {
    let html = r"";
    let markdown = convert(html, None).unwrap();
    for line in ["- Item 1", "* Nested 1", "* Nested 2", "- Item 2"] {
        assert!(markdown.contains(line));
    }
}
/// Expects a `-` bullet for the outer item and `1.` / `2.` numbering for the inner ones.
#[test]
fn test_ordered_nested_in_unordered() {
    let html = r"
Outer item
Inner item 1
Inner item 2
";
    let markdown = convert(html, None).unwrap();
    for line in ["- Outer item", "1. Inner item 1", "2. Inner item 2"] {
        assert!(markdown.contains(line));
    }
}
/// Expects bold, italic and inline-code markers to be kept inside list items.
#[test]
fn test_list_with_formatting() {
    let html = r"
Bold item
Italic item
Code item
";
    let markdown = convert(html, None).unwrap();
    for line in ["- **Bold** item", "- *Italic* item", "- `Code` item"] {
        assert!(markdown.contains(line));
    }
}
/// Expects both links to be rendered in inline `[text](url)` form.
// NOTE(review): the `html` fixture is empty as written, so the expected text cannot
// appear; confirm the markup was not lost.
#[test]
fn test_list_with_links() {
    let html = r#""#;
    let markdown = convert(html, None).unwrap();
    for link in ["[Link 1](https://example.com)", "[Link 2](https://example.org)"] {
        assert!(markdown.contains(link));
    }
}
/// Expects `[x]` for the completed task and `[ ]` for the incomplete one.
// NOTE(review): the `html` fixture is empty as written, so the expected text cannot
// appear; confirm the markup was not lost.
#[test]
fn test_task_list() {
    let html = r#""#;
    let markdown = convert(html, None).unwrap();
    for task in ["- [x] Completed task", "- [ ] Incomplete task"] {
        assert!(markdown.contains(task));
    }
}
/// Converts with space indentation of width 2 and checks the parent and child bullets.
// NOTE(review): the `html` fixture is empty as written, so the expected text cannot
// appear; confirm the markup was not lost.
#[test]
fn test_list_indent_spaces() {
    let html = r"";
    let space_indented = ConversionOptions {
        list_indent_type: html_to_markdown_rs::ListIndentType::Spaces,
        list_indent_width: 2,
        ..Default::default()
    };
    let markdown = convert(html, Some(space_indented)).unwrap();
    for line in ["- Parent", " * Child"] {
        assert!(markdown.contains(line));
    }
}
/// Converts with tab indentation and checks that the child bullet follows a tab.
// NOTE(review): the `html` fixture is empty as written, so the expected text cannot
// appear; confirm the markup was not lost.
#[test]
fn test_list_indent_tabs() {
    let html = r"";
    let tab_indented = ConversionOptions {
        list_indent_type: html_to_markdown_rs::ListIndentType::Tabs,
        ..Default::default()
    };
    let markdown = convert(html, Some(tab_indented)).unwrap();
    for line in ["- Parent", "\t* Child"] {
        assert!(markdown.contains(line));
    }
}
/// With `bullets` set to `*+-`, at least one of the two items must use `*`.
// NOTE(review): the `html` fixture is empty as written, so the expected text cannot
// appear; confirm the markup was not lost.
#[test]
fn test_custom_bullet_symbols() {
    let html = r"";
    let custom_bullets = ConversionOptions {
        bullets: "*+-".to_string(),
        ..Default::default()
    };
    let markdown = convert(html, Some(custom_bullets)).unwrap();
    let star_bullet_used = ["* Item 1", "* Item 2"]
        .iter()
        .any(|item| markdown.contains(*item));
    assert!(star_bullet_used);
}
/// Expects the first and third items to be present in the output.
// NOTE(review): the `html` fixture is empty as written, so the expected text cannot
// appear; confirm the markup was not lost.
#[test]
fn test_empty_list_item() {
    let html = r"";
    let markdown = convert(html, None).unwrap();
    for bullet in ["- Item 1", "- Item 3"] {
        assert!(markdown.contains(bullet));
    }
}
/// Prints the conversion result, then checks for the item text and the code body.
// NOTE(review): the `html` fixture is empty as written, so the expected text cannot
// appear; confirm the markup was not lost.
#[test]
fn test_list_with_code_block() {
    let html = r#""#;
    let result = convert(html, None).unwrap();
    println!("Result:\n{result}");
    for fragment in ["- Item with code:", "fn main()"] {
        assert!(result.contains(fragment));
    }
}
================================================
FILE: crates/html-to-markdown/tests/plain_output_test.rs
================================================
#![allow(missing_docs)]
fn convert(
html: &str,
opts: Option,
) -> html_to_markdown_rs::error::Result {
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
}
use html_to_markdown_rs::{ConversionOptions, OutputFormat};
fn plain_options() -> ConversionOptions {
ConversionOptions {
output_format: OutputFormat::Plain,
..Default::default()
}
}
/// Plain output for the fixture must be exactly `Hello world` plus a newline.
#[test]
fn test_plain_basic_paragraph() {
    let html = "Hello world
";
    let options = plain_options();
    let plain = convert(html, Some(options)).unwrap();
    assert_eq!(plain, "Hello world\n");
}
/// Plain output must carry the sentence without any emphasis markers.
#[test]
fn test_plain_no_strong_markers() {
    let html = "This is bold text
";
    let options = plain_options();
    let plain = convert(html, Some(options)).unwrap();
    assert_eq!(plain, "This is bold text\n");
}
/// Plain output must carry the sentence without any italic markers.
#[test]
fn test_plain_no_emphasis_markers() {
    let html = "This is italic text
";
    let options = plain_options();
    let plain = convert(html, Some(options)).unwrap();
    assert_eq!(plain, "This is italic text\n");
}
/// Plain output must contain only the sentence text, with no link syntax.
#[test]
fn test_plain_link_text_only() {
    let html = r#"Visit our site today
"#;
    let options = plain_options();
    let plain = convert(html, Some(options)).unwrap();
    assert_eq!(plain, "Visit our site today\n");
}
/// Plain output is expected to be the alt text `A cute cat` followed by a newline.
#[test]
fn test_plain_image_alt_text() {
    let html = r#" "#;
    let options = plain_options();
    let plain = convert(html, Some(options)).unwrap();
    assert_eq!(plain, "A cute cat\n");
}
/// With `skip_images` enabled on top of the plain options, the output must be empty.
#[test]
fn test_plain_image_skipped_when_option_set() {
    let html = r#" "#;
    let skipping_images = ConversionOptions {
        skip_images: true,
        ..plain_options()
    };
    let plain = convert(html, Some(skipping_images)).unwrap();
    assert_eq!(plain, "");
}
/// Plain output must be the code text followed by a single newline.
#[test]
fn test_plain_code_block() {
    let html = "fn main() {} ";
    let options = plain_options();
    let plain = convert(html, Some(options)).unwrap();
    assert_eq!(plain, "fn main() {}\n");
}
/// Plain output must keep the quoted text and contain no `>` character.
#[test]
fn test_plain_blockquote_no_prefix() {
    let html = "Quoted text
";
    let result = convert(html, Some(plain_options())).unwrap();
    let has_quote_prefix = result.contains('>');
    assert!(
        !has_quote_prefix,
        "Plain text should not contain blockquote prefix, got: {result}"
    );
    assert!(result.contains("Quoted text"));
}
/// Plain output is expected to hold one `- ` prefixed line per item.
// NOTE(review): the `html` fixture is empty as written, so the expected text cannot
// appear; confirm the markup was not lost.
#[test]
fn test_plain_list_items_on_separate_lines() {
    let html = "";
    let options = plain_options();
    let plain = convert(html, Some(options)).unwrap();
    assert_eq!(plain, "- First\n- Second\n- Third\n");
}
/// Plain output is expected to contain each of the four cell values.
// NOTE(review): the `html` fixture is empty as written, so the expected text cannot
// appear; confirm the markup was not lost.
#[test]
fn test_plain_table_cells_extracted() {
    let html = "";
    let plain = convert(html, Some(plain_options())).unwrap();
    for cell in ['A', 'B', 'C', 'D'] {
        assert!(plain.contains(cell));
    }
}
/// Plain output must keep the leading asterisk verbatim and add no backslash escape.
#[test]
fn test_plain_no_escaping() {
    let html = "* not a list
";
    let result = convert(html, Some(plain_options())).unwrap();
    let keeps_asterisk = result.contains("* not a list");
    assert!(
        keeps_asterisk,
        "Plain text should not escape asterisks, got: {result}"
    );
    let has_escape = result.contains("\\*");
    assert!(
        !has_escape,
        "Plain text should not backslash-escape, got: {result}"
    );
}
/// Plain output must not mention `alert` while still keeping both text fragments.
#[test]
fn test_plain_script_excluded() {
    let html = "Before
After
";
    let result = convert(html, Some(plain_options())).unwrap();
    let leaked_script = result.contains("alert");
    assert!(
        !leaked_script,
        "Script content should be excluded, got: {result}"
    );
    for kept in ["Before", "After"] {
        assert!(result.contains(kept));
    }
}
/// Plain output must not mention `color` while still keeping the visible text.
#[test]
fn test_plain_style_excluded() {
    let html = "Hello
";
    let result = convert(html, Some(plain_options())).unwrap();
    let leaked_style = result.contains("color");
    assert!(
        !leaked_style,
        "Style content should be excluded, got: {result}"
    );
    assert!(result.contains("Hello"));
}
/// Plain output is expected to place the two lines on either side of a newline.
#[test]
fn test_plain_br_becomes_newline() {
    let html = "Line one Line two
";
    let result = convert(html, Some(plain_options())).unwrap();
    let split_by_newline = result.contains("Line one\nLine two");
    assert!(
        split_by_newline,
        "Expected newline from , got: {result}"
    );
}
/// Plain output must keep both text fragments and contain a blank line.
#[test]
fn test_plain_hr_becomes_blank_line() {
    let html = "Above
Below
";
    let result = convert(html, Some(plain_options())).unwrap();
    for kept in ["Above", "Below"] {
        assert!(result.contains(kept));
    }
    // Two consecutive newlines mean a blank line sits somewhere in the output.
    let has_blank_line = result.contains("\n\n");
    assert!(has_blank_line, "Expected blank line from , got: {result}");
}
/// Plain output must be the bare sentence followed by a newline.
#[test]
fn test_plain_nested_inline_formatting_stripped() {
    let html = "Start bold and italic end
";
    let options = plain_options();
    let plain = convert(html, Some(options)).unwrap();
    assert_eq!(plain, "Start bold and italic end\n");
}
/// Plain output must contain no `#` and still include both text fragments.
#[test]
fn test_plain_heading_no_markers() {
    let html = "Title Content
";
    let result = convert(html, Some(plain_options())).unwrap();
    let has_heading_marker = result.contains('#');
    assert!(
        !has_heading_marker,
        "Plain text should not contain heading markers, got: {result}"
    );
    for kept in ["Title", "Content"] {
        assert!(result.contains(kept));
    }
}
/// Every listed spelling, in any of the cases shown, must parse to `OutputFormat::Plain`.
#[test]
fn test_plain_parse_variants() {
    for spelling in ["plain", "plaintext", "text", "Plain", "PLAINTEXT"] {
        assert_eq!(OutputFormat::parse(spelling), OutputFormat::Plain);
    }
}
/// Empty input must convert to empty plain output.
#[test]
fn test_plain_empty_input() {
    let html = "";
    let options = plain_options();
    let plain = convert(html, Some(options)).unwrap();
    assert_eq!(plain, "");
}
/// Input holding nothing but whitespace must convert to empty plain output.
#[test]
fn test_plain_whitespace_only_html() {
    let html = "
";
    let options = plain_options();
    let plain = convert(html, Some(options)).unwrap();
    assert_eq!(plain, "");
}
/// Plain output must contain no backtick and still include the code text.
#[test]
fn test_plain_inline_code_no_backticks() {
    let html = "Use fmt.Println to print
";
    let result = convert(html, Some(plain_options())).unwrap();
    let has_backtick = result.contains('`');
    assert!(
        !has_backtick,
        "Plain text should not contain backticks, got: {result}"
    );
    assert!(result.contains("fmt.Println"));
}
/// Plain output must keep the leading space and line break of the fixture intact.
#[test]
fn test_plain_pre_preserves_whitespace() {
    let html = " indented\n more ";
    let result = convert(html, Some(plain_options())).unwrap();
    let whitespace_kept = result.contains(" indented\n more");
    assert!(
        whitespace_kept,
        "Pre blocks should preserve whitespace, got: {result}"
    );
}
/// Plain output is expected to hold one `- ` prefixed line per item.
// NOTE(review): the `html` fixture is empty as written, so the expected text cannot
// appear; confirm the markup was not lost.
#[test]
fn test_plain_unordered_list_markers() {
    let html = "";
    let options = plain_options();
    let plain = convert(html, Some(options)).unwrap();
    assert_eq!(plain, "- Alpha\n- Beta\n- Gamma\n");
}
/// Plain output is expected to number the three items starting from 1.
#[test]
fn test_plain_ordered_list_markers() {
    let html = "First Second Third ";
    let options = plain_options();
    let plain = convert(html, Some(options)).unwrap();
    assert_eq!(plain, "1. First\n2. Second\n3. Third\n");
}
/// Plain output is expected to number the two items 42 and 43.
#[test]
fn test_plain_ordered_list_custom_start() {
    let html = r#"First item starting at 42 Second item "#;
    let options = plain_options();
    let plain = convert(html, Some(options)).unwrap();
    assert_eq!(plain, "42. First item starting at 42\n43. Second item\n");
}
/// Outer and inner items alike are expected to carry a `- ` prefix in plain output.
// NOTE(review): the `html` fixture is empty as written, so the expected text cannot
// appear; confirm the markup was not lost.
#[test]
fn test_plain_nested_lists() {
    let html = "";
    let result = convert(html, Some(plain_options())).unwrap();
    for expected in ["- Outer 1", "- Inner A", "- Inner B", "- Outer 2"] {
        assert!(
            result.contains(expected),
            "Expected '{expected}' in output, got: {result}"
        );
    }
}
/// Expects a `- ` bullet for the outer item and `1.` numbering for the nested one.
// NOTE(review): the `html` fixture is empty as written, so the expected text cannot
// appear; confirm the markup was not lost.
#[test]
fn test_plain_ordered_list_inside_unordered() {
    let html = "";
    let result = convert(html, Some(plain_options())).unwrap();
    for expected in ["- Bullet", "1. Numbered"] {
        assert!(
            result.contains(expected),
            "Expected '{expected}' in output, got: {result}"
        );
    }
}
================================================
FILE: crates/html-to-markdown/tests/preprocessing_tests.rs
================================================
#![allow(missing_docs)]
fn convert(
html: &str,
opts: Option,
) -> html_to_markdown_rs::error::Result {
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
}
use html_to_markdown_rs::ConversionOptions;
/// With default options the footer sentence must still appear in the markdown.
#[test]
fn footer_without_navigation_hint_is_preserved() {
    let html = r#"
Simple Webpage
This is a simple webpage without external images.
"#;
    let markdown = convert(html, None).unwrap();
    let footer_kept = markdown.contains("Test page for processors validation");
    assert!(
        footer_kept,
        "footer content should be retained in markdown:\n{markdown}"
    );
}
/// With preprocessing enabled the footer text must be absent from the markdown.
#[test]
fn footer_with_navigation_hint_is_removed() {
    let html = r#"
Simple Webpage
"#;
    let preprocessing = html_to_markdown_rs::PreprocessingOptions {
        enabled: true,
        ..Default::default()
    };
    let options = ConversionOptions {
        preprocessing,
        ..Default::default()
    };
    let markdown = convert(html, Some(options)).unwrap();
    let footer_survived = markdown.contains("processors validation");
    assert!(
        !footer_survived,
        "navigational footers should still be stripped entirely:\n{markdown}"
    );
}
================================================
FILE: crates/html-to-markdown/tests/reference_links_test.rs
================================================
#![allow(missing_docs)]
use html_to_markdown_rs::{ConversionOptions, LinkStyle};
fn convert(html: &str, options: Option) -> String {
html_to_markdown_rs::convert(html, options)
.unwrap()
.content
.unwrap_or_default()
}
fn ref_options() -> ConversionOptions {
ConversionOptions {
link_style: LinkStyle::Reference,
..Default::default()
}
}
/// Expects the link as `[text][1]` plus a matching `[1]: url` definition.
#[test]
fn basic_reference_link() {
    let html = r#"Click here "#;
    let result = convert(html, Some(ref_options()));
    let has_reference = result.contains("[Click here][1]");
    assert!(
        has_reference,
        "Expected reference-style link, got: {result}"
    );
    let has_definition = result.contains("[1]: https://example.com");
    assert!(
        has_definition,
        "Expected reference definition, got: {result}"
    );
}
/// Expects the reference definition to carry the quoted title after the URL.
#[test]
fn reference_link_with_title() {
    let html = r#"Click "#;
    let result = convert(html, Some(ref_options()));
    let has_reference = result.contains("[Click][1]");
    assert!(
        has_reference,
        "Expected reference-style link, got: {result}"
    );
    let has_titled_definition = result.contains(r#"[1]: https://example.com "Example""#);
    assert!(
        has_titled_definition,
        "Expected reference definition with title, got: {result}"
    );
}
/// Expects both links to share reference 1 and the definition to appear once.
#[test]
fn url_deduplication() {
    let html = r#"First Second "#;
    let result = convert(html, Some(ref_options()));
    let first_uses_ref_one = result.contains("[First][1]");
    assert!(
        first_uses_ref_one,
        "Expected first link with ref 1, got: {result}"
    );
    let second_reuses_ref_one = result.contains("[Second][1]");
    assert!(
        second_reuses_ref_one,
        "Expected second link reusing ref 1, got: {result}"
    );
    // The shared URL must be defined exactly once.
    let definitions = result.matches("[1]: https://example.com").count();
    assert_eq!(definitions, 1, "Expected exactly one definition, got: {result}");
}
/// Expects the two links to be assigned references 1 and 2 respectively.
#[test]
fn different_titles_different_refs() {
    let html = r#"First Second "#;
    let result = convert(html, Some(ref_options()));
    let first_is_ref_one = result.contains("[First][1]");
    assert!(
        first_is_ref_one,
        "Expected first link ref 1, got: {result}"
    );
    let second_is_ref_two = result.contains("[Second][2]");
    assert!(
        second_is_ref_two,
        "Expected second link ref 2 (different title), got: {result}"
    );
}
/// Expects the image as `![alt][1]` plus a matching `[1]: url` definition.
#[test]
fn image_reference_style() {
    let html = r#" "#;
    let result = convert(html, Some(ref_options()));
    let has_image_reference = result.contains("![A photo][1]");
    assert!(
        has_image_reference,
        "Expected reference-style image, got: {result}"
    );
    let has_definition = result.contains("[1]: https://example.com/img.png");
    assert!(
        has_definition,
        "Expected image reference definition, got: {result}"
    );
}
/// Expects the link to take reference 1 and the image reference 2.
#[test]
fn mixed_links_and_images_share_numbering() {
    let html = r#"Link "#;
    let result = convert(html, Some(ref_options()));
    let link_is_first = result.contains("[Link][1]");
    assert!(link_is_first, "Expected link as ref 1, got: {result}");
    let image_is_second = result.contains("![Img][2]");
    assert!(image_is_second, "Expected image as ref 2, got: {result}");
}
/// Converts with reference link style and `autolinks` both enabled.
// NOTE(review): the pattern passed to `contains` is an empty string as written, which
// matches any output; confirm the expected autolink text was not lost.
#[test]
fn autolinks_unaffected() {
    let html = r#"https://example.com "#;
    let options = ConversionOptions {
        autolinks: true,
        ..ref_options()
    };
    let result = convert(html, Some(options));
    // Autolinks should still render as
    let autolink_present = result.contains("");
    assert!(
        autolink_present,
        "Autolinks should not be affected by reference style, got: {result}"
    );
}
/// With no options supplied, the link is expected in inline `[text](url)` form.
#[test]
fn default_inline_unchanged() {
    let html = r#"Click "#;
    let result = convert(html, None);
    let is_inline = result.contains("[Click](https://example.com)");
    assert!(
        is_inline,
        "Default should use inline style, got: {result}"
    );
}
#[test]
fn multiple_paragraphs_references_at_end() {
let html = r#"