Repository: mgdm/htmlq
Branch: master
Commit: 6e31bc814332
Files: 10
Total size: 27.2 KB
Directory structure:
gitextract_o4u0r91v/
├── .github/
│ └── workflows/
│ └── build.yml
├── .gitignore
├── Cargo.toml
├── LICENSE.md
├── README.md
├── flake.nix
├── src/
│ ├── link.rs
│ ├── main.rs
│ └── pretty_print.rs
└── tests/
└── cli.rs
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/build.yml
================================================
name: Build binaries
on:
release:
types: [published]
jobs:
release:
permissions:
contents: write
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
- uses: actions/cache@v2
with:
path: |
~/.cargo/bin/
~/.cargo/registry/index/
~/.cargo/registry/cache/
~/.cargo/git/db/
target/
key: ${{ runner.os }}-cargo-${{ hashFiles('Cargo.lock') }}
- name: Build release
run: cargo build --release
- name: Archive as .tar.gz (Linux)
if: matrix.os == 'ubuntu-latest'
run: tar cfz htmlq-x86_64-linux.tar.gz -C target/release htmlq
- name: Archive as .tar.gz (macOS)
if: matrix.os == 'macos-latest'
run: tar cfz htmlq-x86_64-darwin.tar.gz -C target/release htmlq
- name: Archive as .zip (Windows)
if: matrix.os == 'windows-latest'
shell: bash
run: 7z a -tzip -mm=Deflate htmlq-x86_64-windows.zip ./target/release/htmlq.exe
- name: Publish
uses: softprops/action-gh-release@v1
with:
files: |
htmlq*.tar.gz
htmlq*.zip
================================================
FILE: .gitignore
================================================
/target
**/*.rs.bk
================================================
FILE: Cargo.toml
================================================
[package]
name = "htmlq"
description = "Like jq, but for HTML."
categories = ["command-line-utilities"]
keywords = ["CSS", "HTML", "query"]
repository = "https://github.com/mgdm/htmlq"
documentation = "https://github.com/mgdm/htmlq/blob/master/README.md"
readme = "README.md"
license = "MIT"
license-file = "LICENSE.md"
version = "0.4.0"
authors = ["Michael Maclean <michael@mgdm.net>"]
edition = "2021"
exclude = ["/.github"]
[dependencies]
kuchiki = "0.8.1"
html5ever = "0.25.1"
clap = "2.33.3"
lazy_static = "1.4.0"
url = "2.2.2"
[dev-dependencies]
assert_cmd = "2.0"
predicates = "2.1"
================================================
FILE: LICENSE.md
================================================
MIT License
Copyright (c) 2019 Michael Maclean
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# htmlq
Like [`jq`](https://stedolan.github.io/jq/), but for HTML. Uses [CSS selectors](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors) to extract bits of content from HTML files.
## Installation
### [Cargo](https://crates.io/crates/htmlq)
```sh
cargo install htmlq
```
### [FreeBSD pkg](https://www.freshports.org/textproc/htmlq)
```sh
pkg install htmlq
```
### [Homebrew](https://formulae.brew.sh/formula/htmlq)
```sh
brew install htmlq
```
### [Scoop](https://scoop.sh/)
```sh
scoop install htmlq
```
## Usage
```console
$ htmlq -h
htmlq 0.4.0
Michael Maclean <michael@mgdm.net>
Runs CSS selectors on HTML
USAGE:
htmlq [FLAGS] [OPTIONS] [--] [selector]...
FLAGS:
-B, --detect-base Try to detect the base URL from the <base> tag in the document. If not found, default to
the value of --base, if supplied
-h, --help Prints help information
-w, --ignore-whitespace When printing text nodes, ignore those that consist entirely of whitespace
-p, --pretty Pretty-print the serialised output
-t, --text Output only the contents of text nodes inside selected elements
-V, --version Prints version information
OPTIONS:
-a, --attribute <attribute> Only return this attribute (if present) from selected elements
-b, --base <base> Use this URL as the base for links
-f, --filename <FILE> The input file. Defaults to stdin
-o, --output <FILE> The output file. Defaults to stdout
-r, --remove-nodes <SELECTOR>... Remove nodes matching this expression before output. May be specified multiple
times
ARGS:
<selector>... The CSS expression to select [default: html]
$
```
## Examples
### Using with cURL to find part of a page by ID
```console
$ curl --silent https://www.rust-lang.org/ | htmlq '#get-help'
<div class="four columns mt3 mt0-l" id="get-help">
<h4>Get help!</h4>
<ul>
<li><a href="https://doc.rust-lang.org">Documentation</a></li>
<li><a href="https://users.rust-lang.org">Ask a Question on the Users Forum</a></li>
<li><a href="http://ping.rust-lang.org">Check Website Status</a></li>
</ul>
<div class="languages">
<label class="hidden" for="language-footer">Language</label>
<select id="language-footer">
<option title="English (US)" value="en-US">English (en-US)</option>
<option title="French" value="fr">Français (fr)</option>
<option title="German" value="de">Deutsch (de)</option>
</select>
</div>
</div>
```
### Find all the links in a page
```console
$ curl --silent https://www.rust-lang.org/ | htmlq --attribute href a
/
/tools/install
/learn
/tools
/governance
/community
https://blog.rust-lang.org/
/learn/get-started
https://blog.rust-lang.org/2019/04/25/Rust-1.34.1.html
https://blog.rust-lang.org/2018/12/06/Rust-1.31-and-rust-2018.html
[...]
```
### Get the text content of a post
```console
$ curl --silent https://nixos.org/nixos/about.html | htmlq --text .main
About NixOS
NixOS is a GNU/Linux distribution that aims to
improve the state of the art in system configuration management. In
existing distributions, actions such as upgrades are dangerous:
upgrading a package can cause other packages to break, upgrading an
entire system is much less reliable than reinstalling from scratch,
you can’t safely test what the results of a configuration change will
be, you cannot easily undo changes to the system, and so on. We want
to change that. NixOS has many innovative features:
[...]
```
### Remove a node before output
There's a big SVG image in this page that I don't need, so here's how to remove it.
```console
$ curl --silent https://nixos.org/ | htmlq '.whynix' --remove-nodes svg
<ul class="whynix">
<li>
<h2>Reproducible</h2>
<p>
Nix builds packages in isolation from each other. This ensures that they
are reproducible and don't have undeclared dependencies, so <strong>if a
package works on one machine, it will also work on another</strong>.
</p>
</li>
<li>
<h2>Declarative</h2>
<p>
Nix makes it <strong>trivial to share development and build
environments</strong> for your projects, regardless of what programming
languages and tools you’re using.
</p>
</li>
<li>
<h2>Reliable</h2>
<p>
Nix ensures that installing or upgrading one package <strong>cannot
break other packages</strong>. It allows you to <strong>roll back to
previous versions</strong>, and ensures that no package is in an
inconsistent state during an upgrade.
</p>
</li>
</ul>
```
### Pretty print HTML
(This is a bit of a work in progress)
```console
$ curl --silent https://mgdm.net | htmlq --pretty '#posts'
<section id="posts">
<h2>I write about...
</h2>
<ul class="post-list">
<li>
<time datetime="2019-04-29 00:%i:1556496000" pubdate="">
29/04/2019</time><a href="/weblog/nettop/">
<h3>Debugging network connections on macOS with nettop
</h3></a>
<p>Using nettop to find out what network connections a program is trying to make.
</p>
</li>
[...]
```
### Syntax highlighting with [`bat`](https://github.com/sharkdp/bat)
```console
$ curl --silent example.com | htmlq 'body' | bat --language html
```
> <img alt="Syntax highlighted output" width="700" src="https://user-images.githubusercontent.com/2346707/132808980-db8991ff-9177-4cb7-a018-39ad94282374.png" />
================================================
FILE: flake.nix
================================================
{
description = "like jq, but for HTML.";
inputs = {
nixpkgs.url = "nixpkgs"; # Resolves to github:NixOS/nixpkgs
# Helpers for system-specific outputs
flake-utils.url = "github:numtide/flake-utils";
crate2nix = {
url = "github:kolloch/crate2nix";
flake = false;
};
};
outputs = { self, nixpkgs, crate2nix, flake-utils }:
# Create system-specific outputs for the standard Nix systems
# https://github.com/numtide/flake-utils/blob/master/default.nix#L3-L9
flake-utils.lib.eachDefaultSystem (system:
let
pkgs = nixpkgs.legacyPackages.${system};
crateName = "htmlq";
inherit (import "${crate2nix}/tools.nix" { inherit pkgs; })
generatedCargoNix;
project = pkgs.callPackage (generatedCargoNix {
name = crateName;
src = ./.;
}) {
defaultCrateOverrides = pkgs.defaultCrateOverrides // {
# Crate dependency overrides go here
};
};
in {
packages.${crateName} = project.rootCrate.build;
defaultPackage = self.packages.${system}.${crateName};
devShell = pkgs.mkShell {
inputsFrom = builtins.attrValues self.packages.${system};
buildInputs = [ pkgs.cargo pkgs.rust-analyzer pkgs.clippy ];
};
});
}
================================================
FILE: src/link.rs
================================================
use html5ever::local_name;
use kuchiki::NodeRef;
use url::Url;
/// Rewrites the `href` attribute of an `<a>`, `<link>` or `<area>` element so
/// that it is resolved against `base`.
///
/// Nodes that are not elements, elements with any other tag name, and elements
/// without an `href` attribute are left untouched. An `href` that starts with
/// four slashes is not joined against `base`; its leading slashes are stripped
/// instead. If joining fails, the attribute is replaced by `base` itself.
pub fn rewrite_relative_url(node: &NodeRef, base: &Url) {
    let Some(element) = node.as_element() else {
        return;
    };

    // Only these three tags are considered to carry a rewritable `href`.
    let tag = &element.name.local;
    let is_link_like =
        *tag == local_name!("a") || *tag == local_name!("link") || *tag == local_name!("area");
    if !is_link_like {
        return;
    }

    let mut attributes = element.attributes.borrow_mut();
    let Some(href) = attributes.get_mut("href") else {
        return;
    };

    if href.starts_with("////") {
        *href = href.trim_start_matches('/').to_string();
        return;
    }

    // Fall back to the base URL itself when the value cannot be joined.
    let resolved = match base.join(href.as_str()) {
        Ok(joined) => joined,
        Err(_) => base.to_owned(),
    };
    attributes.insert("href", resolved.to_string());
}
/// Looks for the first `<base>` element in `document` and parses its `href`.
///
/// Returns `None` when there is no `<base>` element, when it has no `href`
/// attribute, or when the attribute value is not accepted by `Url::parse`.
pub fn detect_base(document: &NodeRef) -> Option<Url> {
    let base_element = document.select_first("base").ok()?;
    let attributes = base_element.attributes.borrow();
    let href = attributes.get("href")?;

    Url::parse(href).ok()
}
#[cfg(test)]
mod tests {
    use html5ever::tendril::TendrilSink;

    use super::*;

    /// Generates one `#[test]` per entry. Each entry is an `(input, expected)`
    /// pair of HTML strings; every `a`, `area` and `link` element of the input
    /// is rewritten against `https://mgdm.net` and the re-serialised document
    /// is compared with `expected`.
    macro_rules! rewrite_tests {
        ($($name:ident: $value:expr,)*) => {
            $(
                #[test]
                fn $name() {
                    let (mut html, want) = $value;
                    let base_url = Url::parse("https://mgdm.net").unwrap();
                    let document = make_doc(&mut html);

                    let links = document
                        .select("a, area, link")
                        .expect("Failed to parse CSS selector while doing link rewriting");
                    for link in links {
                        rewrite_relative_url(link.as_node(), &base_url);
                    }

                    assert_eq!(want, serialize_doc(&document));
                }
            )*
        }
    }

    /// Generates one `#[test]` per entry. Each entry is an `(input, expected)`
    /// pair where `expected` is the `Option<Url>` that `detect_base` should
    /// return for the input HTML.
    macro_rules! detect_base_tests {
        ($($name:ident: $value:expr,)*) => {
            $(
                #[test]
                fn $name() {
                    let (mut html, want) = $value;
                    let document = make_doc(&mut html);

                    assert_eq!(want, detect_base(&document));
                }
            )*
        }
    }

    /// Parses `html` into a kuchiki document tree.
    fn make_doc(html: &mut String) -> NodeRef {
        let mut bytes = html.as_bytes();
        kuchiki::parse_html()
            .from_utf8()
            .read_from(&mut bytes)
            .unwrap()
    }

    /// Serialises `doc` back into an HTML string.
    fn serialize_doc(doc: &NodeRef) -> String {
        let mut buffer = Vec::<u8>::new();
        doc.serialize(&mut buffer).unwrap();
        String::from_utf8(buffer).unwrap()
    }

    rewrite_tests! {
        rewrite_a_href: (
            "<html><head></head><body><a href=\"/foo/bar\">Hello</a></body></html>".to_string(),
            "<html><head></head><body><a href=\"https://mgdm.net/foo/bar\">Hello</a></body></html>".to_string(),
        ),
        rewrite_link_href: (
            "<html><head><link href=\"/style.css\" rel=\"stylesheet\"/></head><body>Hello</body></html>".to_string(),
            "<html><head><link href=\"https://mgdm.net/style.css\" rel=\"stylesheet\"></head><body>Hello</body></html>".to_string(),
        ),
        rewrite_map_area_href: (
            "<html><head></head><body><map name=\"primary\"><area coords=\"75,75,75\" href=\"left.html\" shape=\"circle\"></map></body></html>".to_string(),
            "<html><head></head><body><map name=\"primary\"><area coords=\"75,75,75\" href=\"https://mgdm.net/left.html\" shape=\"circle\"></map></body></html>".to_string()
        ),
        do_not_rewrite_absolute_url: (
            "<html><head></head><body><a href=\"https://example.org/foo/bar\">Hello</a></body></html>".to_string(),
            "<html><head></head><body><a href=\"https://example.org/foo/bar\">Hello</a></body></html>".to_string(),
        ),
    }

    detect_base_tests! {
        base_ok: (
            "<html><head><base href=\"https://example.org\"></head><body><a href=\"https://example.org/foo/bar\">Hello</a></body></html>".to_string(),
            Some(Url::parse("https://example.org").unwrap())
        ),
        base_not_found: (
            "<html><head></head><body><a href=\"https://example.org/foo/bar\">Hello</a></body></html>".to_string(),
            None
        ),
    }
}
================================================
FILE: src/main.rs
================================================
extern crate html5ever;
extern crate kuchiki;
#[macro_use]
extern crate lazy_static;
mod link;
mod pretty_print;
use clap::{App, Arg, ArgMatches};
use kuchiki::traits::*;
use kuchiki::NodeRef;
use std::borrow::BorrowMut;
use std::error::Error;
use std::fs::File;
use std::io;
use std::str;
use url::Url;
/// Runtime settings for a single htmlq invocation, built from the
/// command-line arguments by `Config::from_args`.
#[derive(Debug, Clone)]
struct Config {
    /// Path of the input file; `"-"` means standard input.
    input_path: String,
    /// Path of the output file; `"-"` means standard output.
    output_path: String,
    /// CSS selector used to pick the elements to output.
    selector: String,
    /// Value of `--base`, if supplied; parsed as a URL in `main`.
    base: Option<String>,
    /// Whether `--detect-base` was given.
    detect_base: bool,
    /// Whether `--text` was given.
    text_only: bool,
    /// Whether `--ignore-whitespace` was given.
    ignore_whitespace: bool,
    /// Whether `--pretty` was given.
    pretty_print: bool,
    /// Selectors passed via `--remove-nodes`, if any.
    remove_nodes: Option<Vec<String>>,
    /// Attribute names passed via `--attribute`, if any. When this is `Some`,
    /// `main` prints only these attributes for each selected element.
    attributes: Option<Vec<String>>,
}
impl Config {
    /// Builds a `Config` from parsed command-line matches.
    ///
    /// A missing `filename` or `output` value falls back to `"-"`, and a
    /// missing `selector` falls back to `"html"`. Multiple selector values are
    /// joined with a single space. This function always returns `Some`.
    fn from_args(matches: ArgMatches) -> Option<Config> {
        // Collects every value of an argument into owned strings.
        let owned_values = |name: &str| -> Option<Vec<String>> {
            matches
                .values_of(name)
                .map(|values| values.map(String::from).collect())
        };
        // Reads a single-valued argument, substituting "-" when it is absent.
        let path_or_dash = |name: &str| -> String {
            String::from(matches.value_of(name).unwrap_or("-"))
        };

        let selector: String = matches.values_of("selector").map_or_else(
            || String::from("html"),
            |parts| parts.collect::<Vec<&str>>().join(" "),
        );

        Some(Config {
            input_path: path_or_dash("filename"),
            output_path: path_or_dash("output"),
            selector,
            base: matches.value_of("base").map(str::to_owned),
            detect_base: matches.is_present("detect_base"),
            text_only: matches.is_present("text_only"),
            ignore_whitespace: matches.is_present("ignore_whitespace"),
            pretty_print: matches.is_present("pretty_print"),
            remove_nodes: owned_values("remove_nodes"),
            attributes: owned_values("attribute"),
        })
    }
}
impl Default for Config {
fn default() -> Self {
Self {
input_path: "-".to_string(),
output_path: "-".to_string(),
selector: "html".to_string(),
base: None,
detect_base: false,
ignore_whitespace: true,
pretty_print: true,
text_only: false,
remove_nodes: None,
attributes: Some(vec![]),
}
}
}
/// Writes the value of each requested attribute of `node` to `output`, one
/// per line, in the order given by `attributes`.
///
/// Nothing is written when `node` is not an element or when its attribute map
/// cannot be borrowed. Attributes the element does not have are skipped, and
/// write errors are ignored.
fn select_attributes(node: &NodeRef, attributes: &[String], output: &mut dyn io::Write) {
    let Some(element) = node.as_element() else {
        return;
    };
    let Ok(element_attrs) = element.attributes.try_borrow() else {
        return;
    };

    attributes
        .iter()
        .filter_map(|name| element_attrs.get(name.as_str()))
        .for_each(|value| {
            writeln!(output, "{}", value).ok();
        });
}
/// Concatenates the text nodes found in `node` and its descendants.
///
/// With `ignore_whitespace` set, text nodes that are empty after trimming are
/// skipped and every retained text node is followed by a newline. Otherwise
/// all text nodes are appended verbatim with nothing in between.
fn serialize_text(node: &NodeRef, ignore_whitespace: bool) -> String {
    let mut collected = String::new();

    for text_node in node.inclusive_descendants().text_nodes() {
        let text = text_node.borrow();

        if !ignore_whitespace {
            collected.push_str(&text);
        } else if !text.trim().is_empty() {
            collected.push_str(&text);
            collected.push('\n');
        }
    }

    collected
}
fn get_config<'a, 'b>() -> App<'a, 'b> {
App::new("htmlq")
.version("0.4.0")
.author("Michael Maclean <michael@mgdm.net>")
.about("Runs CSS selectors on HTML")
.arg(
Arg::with_name("filename")
.short("f")
.long("filename")
.value_name("FILE")
.help("The input file. Defaults to stdin")
.takes_value(true),
)
.arg(
Arg::with_name("output")
.short("o")
.long("output")
.value_name("FILE")
.help("The output file. Defaults to stdout")
.takes_value(true),
)
.arg(
Arg::with_name("pretty_print")
.short("p")
.long("pretty")
.help("Pretty-print the serialised output"),
)
.arg(
Arg::with_name("text_only")
.short("t")
.long("text")
.help("Output only the contents of text nodes inside selected elements"),
)
.arg(
Arg::with_name("ignore_whitespace")
.short("w")
.long("ignore-whitespace")
.help("When printing text nodes, ignore those that consist entirely of whitespace"),
)
.arg(
Arg::with_name("attribute")
.short("a")
.long("attribute")
.takes_value(true)
.help("Only return this attribute (if present) from selected elements"),
)
.arg(
Arg::with_name("base")
.short("b")
.long("base")
.takes_value(true)
.help("Use this URL as the base for links"),
)
.arg(
Arg::with_name("detect_base")
.short("B")
.long("detect-base")
.help("Try to detect the base URL from the <base> tag in the document. If not found, default to the value of --base, if supplied"),
)
.arg(
Arg::with_name("remove_nodes")
.long("remove-nodes")
.short("r")
.multiple(true)
.number_of_values(1)
.takes_value(true)
.value_name("SELECTOR")
.help("Remove nodes matching this expression before output. May be specified multiple times")
)
.arg(
Arg::with_name("selector")
.default_value("html")
.multiple(true)
.help("The CSS expression to select"),
)
}
fn main() -> Result<(), Box<dyn Error>> {
let config = get_config();
let matches = config.get_matches();
let config = Config::from_args(matches).unwrap_or_default();
let mut input: Box<dyn io::Read> = match config.input_path.as_ref() {
"-" => Box::new(std::io::stdin()),
f => Box::new(File::open(f).expect("should have opened input file")),
};
let stdout = std::io::stdout();
let mut output: Box<dyn io::Write> = match config.output_path.as_ref() {
"-" => Box::new(stdout.lock()),
f => Box::new(File::create(f).expect("should have created output file")),
};
let document = kuchiki::parse_html().from_utf8().read_from(&mut input)?;
let base: Option<Url> = match (&config.base, &config.detect_base) {
(Some(base), true) => link::detect_base(&document).or(Url::parse(&base).ok()),
(Some(base), false) => Url::parse(&base).ok(),
(None, true) => link::detect_base(&document),
_ => None,
};
let remove_node_selector = match config.remove_nodes {
Some(ref remove_node_selectors) => remove_node_selectors.join(","),
None => Default::default(),
};
document
.select(&config.selector)
.expect("Failed to parse CSS selector")
.filter(|noderef| {
if let Ok(mut node) = noderef.as_node().select_first(&remove_node_selector) {
node.borrow_mut().as_node().detach();
false
} else {
true
}
})
.map(|node| {
if let Some(base) = &base {
link::rewrite_relative_url(node.as_node(), &base)
}
node
})
.for_each(|matched_noderef| {
let node = matched_noderef.as_node();
if let Some(attributes) = &config.attributes {
select_attributes(node, attributes, &mut output);
return;
}
if config.text_only {
// let content = serialize_text(node, config.ignore_whitespace);
// output.write_all(format!("{}\n", content).as_ref()).ok();
writeln!(output, "{}", serialize_text(node, config.ignore_whitespace)).ok();
return;
}
if config.pretty_print {
// let content = pretty_print::pretty_print(node);
// output.write_all(content.as_ref()).ok();
writeln!(output, "{}", pretty_print::pretty_print(node)).ok();
return;
}
writeln!(output, "{}", node.to_string()).ok();
// let mut content: Vec<u8> = Vec::new();
// let Ok(_) = node.serialize(&mut content) else {
// return
// };
// output.write_all(format!("{}\n", content).as_ref()).ok();
});
Ok(())
}
================================================
FILE: src/pretty_print.rs
================================================
use html5ever::serialize::AttrRef;
use html5ever::serialize::HtmlSerializer;
use html5ever::serialize::Serialize;
use html5ever::serialize::SerializeOpts;
use html5ever::serialize::Serializer;
use html5ever::serialize::TraversalScope;
use html5ever::QualName;
// use kuchiki::traits::TendrilSink;
use kuchiki::NodeRef;
use std::collections::HashSet;
use std::io;
use std::io::Write;
use std::str;
lazy_static! {
    /// Tag names that `is_inline` treats as inline elements.
    static ref INLINE_ELEMENTS: HashSet<&'static str> = [
        "a", "abbr", "acronym", "audio", "b", "bdi", "bdo", "big", "button", "canvas", "cite",
        "code", "data", "datalist", "del", "dfn", "em", "embed", "i", "iframe", "img", "input",
        "ins", "kbd", "label", "map", "mark", "meter", "noscript", "object", "output", "picture",
        "progress", "q", "ruby", "s", "samp", "script", "select", "slot", "small", "span",
        "strong", "sub", "sup", "svg", "template", "textarea", "time", "u", "tt", "var", "video",
        "wbr",
    ]
    .iter()
    .copied()
    .collect();
}
fn is_inline(name: &str) -> bool {
INLINE_ELEMENTS.contains(name)
}
/// Serializer wrapper that adds line breaks and indentation around the markup
/// produced by an inner `HtmlSerializer`.
struct PrettyPrint<W: Write> {
    /// Current indentation in spaces; raised by 2 in `start_elem` and lowered
    /// by 2 in `end_elem`.
    indent: usize,
    /// `true` after a non-inline element has been closed; cleared when an
    /// inline element is closed or non-blank text is written.
    previous_was_block: bool,
    /// The serializer that emits the actual HTML.
    inner: HtmlSerializer<W>,
}
impl<W: Write> PrettyPrint<W> {
    /// Writes a newline followed by `self.indent` spaces to the underlying writer.
    fn break_line(&mut self) -> io::Result<()> {
        let padding = vec![b' '; self.indent];
        self.inner.writer.write_all(b"\n")?;
        self.inner.writer.write_all(&padding)
    }
}

impl<W: Write> Serializer for PrettyPrint<W> {
    /// Opens an element. A new indented line is started first unless the
    /// element is inline and the previously closed element was not a block.
    fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) -> io::Result<()>
    where
        AttrIter: Iterator<Item = AttrRef<'a>>,
    {
        let needs_new_line = self.previous_was_block || !is_inline(&name.local);
        if needs_new_line {
            self.break_line()?;
        }

        self.indent += 2;
        self.inner.start_elem(name, attrs)
    }

    /// Closes an element. Non-inline elements get their closing tag on a new
    /// line at the outer indentation level.
    fn end_elem(&mut self, name: QualName) -> io::Result<()> {
        self.indent -= 2;

        let is_block = !is_inline(&name.local);
        if is_block {
            self.break_line()?;
        }
        self.previous_was_block = is_block;

        self.inner.end_elem(name)
    }

    /// Writes a text node. Whitespace-only text is dropped; other text starts
    /// on a new indented line when it follows a closed block element.
    fn write_text(&mut self, text: &str) -> io::Result<()> {
        if text.trim().is_empty() {
            return Ok(());
        }

        if self.previous_was_block {
            self.break_line()?;
        }
        self.previous_was_block = false;

        self.inner.write_text(text)
    }

    /// Passes the comment through to the inner serializer unchanged.
    fn write_comment(&mut self, text: &str) -> io::Result<()> {
        self.inner.write_comment(text)
    }

    /// Passes the doctype through to the inner serializer unchanged.
    fn write_doctype(&mut self, name: &str) -> io::Result<()> {
        self.inner.write_doctype(name)
    }

    /// Passes the processing instruction through to the inner serializer unchanged.
    fn write_processing_instruction(&mut self, target: &str, data: &str) -> io::Result<()> {
        self.inner.write_processing_instruction(target, data)
    }
}
/// Serialises `node` (the node itself included) through `PrettyPrint` and
/// returns the result as a `String`.
///
/// Panics if serialisation fails or if the produced bytes are not valid UTF-8.
pub fn pretty_print(node: &NodeRef) -> String {
    let mut buffer = Vec::<u8>::new();

    let options = SerializeOpts {
        traversal_scope: TraversalScope::IncludeNode,
        ..Default::default()
    };
    let mut printer = PrettyPrint {
        indent: 0,
        previous_was_block: false,
        inner: HtmlSerializer::new(&mut buffer, options),
    };
    Serialize::serialize(node, &mut printer, TraversalScope::IncludeNode).unwrap();

    let rendered: &str = str::from_utf8(buffer.as_ref()).unwrap();
    rendered.to_owned()
}
================================================
FILE: tests/cli.rs
================================================
use assert_cmd::Command;
use predicates::prelude::*;
/// Generates one `#[test]` per entry. Each entry is a `(stdin, args, expected)`
/// triple: the `htmlq` binary is run with `args`, fed `stdin`, and must exit
/// successfully with exactly `expected` on standard output.
macro_rules! cmd_success_tests {
    ($($name:ident: $value:expr,)*) => {
        $(
            #[test]
            fn $name() {
                let (html, cli_args, want) = $value;

                let mut htmlq = Command::cargo_bin("htmlq").unwrap();
                htmlq.args(cli_args).write_stdin(html);

                htmlq
                    .assert()
                    .success()
                    .stdout(predicate::str::diff(want));
            }
        )*
    }
}
cmd_success_tests!(
    // Select an element by class name.
    find_by_class: (
        r#"<html><head></head><body><div class="hi"><a href="/foo/bar">Hello</a></div></body></html>"#,
        [".hi"],
        "<div class=\"hi\"><a href=\"/foo/bar\">Hello</a></div>\n"
    ),
    // Select an element by id.
    find_by_id: (
        r#"<html><head></head><body><div id="my-id"><a href="/foo/bar">Hello</a></div></body></html>"#,
        ["#my-id"],
        "<div id=\"my-id\"><a href=\"/foo/bar\">Hello</a></div>\n"
    ),
    // The selected element is still printed, minus the removed link.
    remove_links: (
        r#"<html><head></head><body><div id="my-id"><a href="/foo/bar">Hello</a></div></body></html>"#,
        ["#my-id", "--remove-nodes", "a"],
        "<div id=\"my-id\"></div>\n",
    ),
);
gitextract_o4u0r91v/
├── .github/
│ └── workflows/
│ └── build.yml
├── .gitignore
├── Cargo.toml
├── LICENSE.md
├── README.md
├── flake.nix
├── src/
│ ├── link.rs
│ ├── main.rs
│ └── pretty_print.rs
└── tests/
└── cli.rs
SYMBOL INDEX (20 symbols across 3 files)
FILE: src/link.rs
function rewrite_relative_url (line 5) | pub fn rewrite_relative_url(node: &NodeRef, base: &Url) {
function detect_base (line 30) | pub fn detect_base(document: &NodeRef) -> Option<Url> {
function make_doc (line 93) | fn make_doc(html: &mut String) -> NodeRef {
function serialize_doc (line 100) | fn serialize_doc(doc: &NodeRef) -> String {
FILE: src/main.rs
type Config (line 21) | struct Config {
method from_args (line 35) | fn from_args(matches: ArgMatches) -> Option<Config> {
method default (line 67) | fn default() -> Self {
function select_attributes (line 83) | fn select_attributes(node: &NodeRef, attributes: &[String], output: &mut...
function serialize_text (line 95) | fn serialize_text(node: &NodeRef, ignore_whitespace: bool) -> String {
function get_config (line 112) | fn get_config<'a, 'b>() -> App<'a, 'b> {
function main (line 189) | fn main() -> Result<(), Box<dyn Error>> {
FILE: src/pretty_print.rs
function is_inline (line 28) | fn is_inline(name: &str) -> bool {
type PrettyPrint (line 32) | struct PrettyPrint<W: Write> {
method start_elem (line 39) | fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) ...
method end_elem (line 55) | fn end_elem(&mut self, name: QualName) -> io::Result<()> {
method write_text (line 69) | fn write_text(&mut self, text: &str) -> io::Result<()> {
method write_comment (line 83) | fn write_comment(&mut self, text: &str) -> io::Result<()> {
method write_doctype (line 87) | fn write_doctype(&mut self, name: &str) -> io::Result<()> {
method write_processing_instruction (line 91) | fn write_processing_instruction(&mut self, target: &str, data: &str) -> ...
function pretty_print (line 96) | pub fn pretty_print(node: &NodeRef) -> String {
Condensed preview — 10 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (30K chars).
[
{
"path": ".github/workflows/build.yml",
"chars": 1346,
"preview": "name: Build binaries\n\non:\n release:\n types: [published]\n\njobs:\n release:\n permissions:\n contents: write\n "
},
{
"path": ".gitignore",
"chars": 19,
"preview": "/target\n**/*.rs.bk\n"
},
{
"path": "Cargo.toml",
"chars": 592,
"preview": "[package]\nname = \"htmlq\"\ndescription = \"Like jq, but for HTML.\"\ncategories = [\"command-line-utilities\"]\nkeywords = [\"CSS"
},
{
"path": "LICENSE.md",
"chars": 1072,
"preview": "MIT License\n\nCopyright (c) 2019 Michael Maclean\n\nPermission is hereby granted, free of charge, to any person obtaining a"
},
{
"path": "README.md",
"chars": 5826,
"preview": "# htmlq\nLike [`jq`](https://stedolan.github.io/jq/), but for HTML. Uses [CSS selectors](https://developer.mozilla.org/en"
},
{
"path": "flake.nix",
"chars": 1324,
"preview": "{\n description = \"like jq, but for HTML.\";\n\n inputs = {\n nixpkgs.url = \"nixpkgs\"; # Resolves to github:NixOS/nixpkg"
},
{
"path": "src/link.rs",
"chars": 4526,
"preview": "use html5ever::local_name;\nuse kuchiki::NodeRef;\nuse url::Url;\n\npub fn rewrite_relative_url(node: &NodeRef, base: &Url) "
},
{
"path": "src/main.rs",
"chars": 8585,
"preview": "extern crate html5ever;\nextern crate kuchiki;\n\n#[macro_use]\nextern crate lazy_static;\n\nmod link;\nmod pretty_print;\n\nuse "
},
{
"path": "src/pretty_print.rs",
"chars": 3453,
"preview": "use html5ever::serialize::AttrRef;\nuse html5ever::serialize::HtmlSerializer;\nuse html5ever::serialize::Serialize;\nuse ht"
},
{
"path": "tests/cli.rs",
"chars": 1157,
"preview": "use assert_cmd::Command;\nuse predicates::prelude::*;\n\nmacro_rules! cmd_success_tests {\n ($($name:ident: $value:expr,)"
}
]
About this extraction
This page contains the full source code of the mgdm/htmlq GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 10 files (27.2 KB), approximately 7.1k tokens, and a symbol index with 20 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.