Each Node is still the same DOM connected Node.
@return a new ArrayList containing the nodes in this list
@see #Nodes(List)
*/
public ArrayList<T> asList() {
    // Copy constructor: the result is a new, independent list that holds the same node instances as this one.
    return new ArrayList<>(this);
}
/**
 Remove each matched node from the DOM.
 <p>The nodes are still retained in this list, in case further processing of them is desired.</p>
 <p>E.g. HTML: {@code <div><p>Hello</p> <p>there</p> <img></div>}<br>
 {@code doc.select("p").remove();}<br>
 HTML = {@code <div> <img></div>}</p>
 <p>Note that this method should not be used to clean user-submitted HTML; rather, use
 {@link org.jsoup.safety.Cleaner} to clean HTML.</p>
 @return this, for chaining
 @see Element#empty()
 @see Elements#empty()
 @see #clear()
 */
public Nodes<T> remove() {
    // Only the DOM is touched here; the list itself is left intact (see clear() to empty both).
    for (T node : this) {
        node.remove();
    }
    return this;
}
/**
 Render every node in this list to its outer HTML, and join the results with a newline ({@code "\n"}).
 @return the outer HTML of each node in list order, newline separated
 @see Elements#text()
 @see Elements#html()
 */
public String outerHtml() {
    return this.stream()
        .map(node -> node.outerHtml())
        .collect(StringUtil.joining("\n"));
}
/**
 String form of this list: the combined outer HTML of all its nodes. Simply delegates to {@link #outerHtml()}.
 @return the outer HTML of every node in this list
 @see Elements#text()
 @see #outerHtml()
 */
@Override
public String toString() {
    return this.outerHtml();
}
/**
 Insert the supplied HTML before each matched node's outer HTML.
 @param html HTML to insert before each node
 @return this, for chaining
 @see Element#before(String)
 */
public Nodes<T> before(String html) {
    for (T node : this) {
        node.before(html);
    }
    return this;
}
/**
 Insert the supplied HTML after each matched node's outer HTML.
 @param html HTML to insert after each node
 @return this, for chaining
 @see Element#after(String)
 */
public Nodes<T> after(String html) {
    for (T node : this) {
        node.after(html);
    }
    return this;
}
/**
 Wrap the supplied HTML around each matched node. For example, with HTML
 {@code <p><b>This</b> is <b>Jsoup</b></p>},
 {@code doc.select("b").wrap("<i></i>");}
 becomes {@code <p><i><b>This</b></i> is <i><b>Jsoup</b></i></p>}
 @param html HTML to wrap around each node, e.g. {@code <div class="head"></div>}. Can be arbitrarily deep.
 Validated as not empty before any node is touched.
 @return this (for chaining)
 @see Element#wrap
 */
public Nodes<T> wrap(String html) {
    Validate.notEmpty(html);
    for (T node : this) {
        node.wrap(html);
    }
    return this;
}
// list-like methods
/**
 Get the first node in this list.
 @return the node at index 0, or {@code null} if this list is empty
 */
public @Nullable T first() {
    if (isEmpty()) {
        return null;
    }
    return get(0);
}
/**
 Get the last node in this list.
 @return the node at the final index, or {@code null} if this list is empty
 */
public @Nullable T last() {
    if (isEmpty()) {
        return null;
    }
    final int lastIndex = size() - 1;
    return get(lastIndex);
}
// ArrayList methods that update the DOM:
/**
 Replace the node at the given index, both in this list and in the DOM: the node previously held at that index is
 swapped out of the document in favour of the supplied node.
 @param index index of the node to replace
 @param node the replacement node; validated as not null
 @return the node that was previously at this index
 */
@Override
public T set(int index, T node) {
    Validate.notNull(node);
    final T replaced = super.set(index, node);
    // the list slot is already updated; now mirror the swap in the document
    replaced.replaceWith(node);
    return replaced;
}
/**
 Remove the node at the given index from this list, and also detach it from the DOM.
 @param index the index of the node to be removed
 @return the node that was at this index
 @see #deselect(int)
 */
@Override
public T remove(int index) {
    final T removed = super.remove(index);
    removed.remove(); // detach from the document as well
    return removed;
}
/**
 Remove the given node from this list and from the DOM, if this list contains it.
 @param o node to be removed from this list, if present
 @return {@code true} if this list contained the node (and it was removed)
 @see #deselect(Object)
 */
@Override
public boolean remove(Object o) {
    final int position = super.indexOf(o);
    if (position == -1) {
        return false;
    }
    // delegate to the index variant so the node is detached from the DOM too
    remove(position);
    return true;
}
/**
 Drop the node at the given index from this list only; the node is left where it is in the DOM.
 @param index the index of the node to be removed
 @return the node that was at this index
 @see #remove(int)
 */
public T deselect(int index) {
    final T deselected = super.remove(index);
    return deselected;
}
/**
 Drop the given node from this list only; the node is left where it is in the DOM.
 @param o node to be removed from this list, if present
 @return {@code true} if this list contained the node
 @see #remove(Object)
 */
public boolean deselect(Object o) {
    final boolean wasPresent = super.remove(o);
    return wasPresent;
}
/**
 Empty this list, and remove each of its nodes from the DOM.
 @see #deselectAll()
 */
@Override
public void clear() {
    this.remove();  // detach every node from the DOM while the list still holds them
    super.clear();  // then empty the list itself
}
/**
Empties this list without touching the DOM: unlike {@link #clear()}, the nodes are only dropped from the list and
are not removed from the document.
@see #clear()
*/
public void deselectAll() {
// calls the superclass clear directly, bypassing this class's DOM-removing clear() override
super.clear();
}
/**
Removes from this list, and from the DOM, each of the nodes that are contained in the specified collection and are
in this list.
@param c collection containing nodes to be removed from this list
@return {@code true} if nodes were removed from this list
*/
@Override
public boolean removeAll(Collection> c) {
boolean anyRemoved = false;
for (Object o : c) {
anyRemoved |= this.remove(o);
}
return anyRemoved;
}
/**
Retain in this list, and in the DOM, only the nodes that are in the specified collection and are in this list. In
other words, remove nodes from this list and the DOM any item that is in this list but not in the specified
collection.
@param toRemove collection containing nodes to be retained in this list
@return {@code true} if nodes were removed from this list
@since 1.17.1
*/
@Override
public boolean retainAll(Collection> toRemove) {
boolean anyRemoved = false;
for (Iterator it = this.iterator(); it.hasNext(); ) {
T el = it.next();
if (!toRemove.contains(el)) {
it.remove();
anyRemoved = true;
}
}
return anyRemoved;
}
/**
Remove from the list, and from the DOM, all nodes in this list that mach the given predicate.
@param filter a predicate which returns {@code true} for nodes to be removed
@return {@code true} if nodes were removed from this list
*/
@Override
public boolean removeIf(Predicate super T> filter) {
boolean anyRemoved = false;
for (Iterator it = this.iterator(); it.hasNext(); ) {
T node = it.next();
if (filter.test(node)) {
it.remove();
anyRemoved = true;
}
}
return anyRemoved;
}
/**
 Replace each node in this list with the result of the operator, and update the DOM.
 @param operator the operator to apply to each node; its result is stored via {@link #set(int, Node)}, which
 validates it as not null
 */
@Override
public void replaceAll(UnaryOperator<T> operator) {
    for (int i = 0; i < this.size(); i++) {
        // set() swaps the node in the DOM as well as in the list
        this.set(i, operator.apply(this.get(i)));
    }
}
}
================================================
FILE: src/main/java/org/jsoup/select/QueryParser.java
================================================
package org.jsoup.select;
import org.jsoup.helper.Regex;
import org.jsoup.internal.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.LeafNode;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.TokenQueue;
import org.jspecify.annotations.Nullable;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun;
import static org.jsoup.internal.Normalizer.normalize;
/**
* Parses a CSS selector into an Evaluator tree.
*/
public class QueryParser implements AutoCloseable {
// Explicit combinator characters. ' ' (descendant) is also a combinator, but is found implicitly.
private static final char[] Combinators = {'>', '+', '~'};
// Operators that can follow an attribute key inside [...]
private static final String[] AttributeEvals = {"=", "!=", "^=", "$=", "*=", "~="};
// Characters that end a selector sequence: ',' (next selector in a group) and ')' (close of a nested group)
private static final char[] SequenceEnders = {',', ')'};
// Token queue over the trimmed query
private final TokenQueue tq;
// The trimmed query text; used in error messages and by toString()
private final String query;
private boolean inNodeContext; // ::comment:contains should act on node value, vs element text
/**
* Create a new QueryParser.
* @param query CSS query
*/
private QueryParser(String query) {
Validate.notEmpty(query);
query = query.trim();
this.query = query;
this.tq = new TokenQueue(query);
}
/**
 Parse a CSS query into an Evaluator. If you are evaluating the same query repeatedly, it may be more efficient to
 parse it once and reuse the Evaluator.
 <p>Any {@link IllegalArgumentException} raised while parsing is rethrown as a
 {@link Selector.SelectorParseException} carrying the same message.</p>
 @param query CSS query
 @return Evaluator
 @see Selector selector query syntax
 @throws Selector.SelectorParseException if the CSS query is invalid
 */
public static Evaluator parse(String query) {
    try (QueryParser parser = new QueryParser(query)) {
        return parser.parse();
    } catch (IllegalArgumentException invalid) {
        throw new Selector.SelectorParseException(invalid.getMessage());
    }
}
/**
 Parse the whole query, and fail if any input is left over. We use this simplified expression of the grammar:
 <pre>{@code
 SelectorGroup  ::= Selector (',' Selector)*
 Selector       ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )*
 SimpleSequence ::= [ TypeSelector ] ( ID | Class | Attribute | Pseudo )*
 Pseudo         ::= ':' Name [ '(' SelectorGroup ')' ]
 Combinator     ::= S+    // descendant (whitespace)
                  | '>'   // child
                  | '+'   // adjacent sibling
                  | '~'   // general sibling
 }</pre>
 See selectors-4 for the real thing.
 */
Evaluator parse() {
    final Evaluator parsed = parseSelectorGroup();
    tq.consumeWhitespace();
    if (tq.isEmpty()) {
        return parsed;
    }
    // something remains that the grammar above could not consume
    throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
}
// SelectorGroup ::= Selector (',' Selector)*  - multiple selectors are merged into an Or
Evaluator parseSelectorGroup() {
    Evaluator group = parseSelector();
    while (tq.matchChomp(',')) {
        group = or(group, parseSelector());
    }
    return group;
}
// Selector ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )*
Evaluator parseSelector() {
    tq.consumeWhitespace();

    // A leading combinator (e.g. "> div") has no left-hand sequence, so it is applied against the root element
    Evaluator left = tq.matchesAny(Combinators)
        ? new StructuralEvaluator.Root()
        : parseSimpleSequence();

    for (;;) {
        char op = 0;
        if (tq.consumeWhitespace()) {
            op = ' '; // whitespace seen: a descendant combinator, unless an explicit one follows
        }
        if (tq.matchesAny(Combinators)) {
            op = tq.consume(); // explicit combinator wins over the implied descendant
        } else if (tq.matchesAny(SequenceEnders)) {
            break; // ',' - space after a simple sequence like "foo , bar"; ')' - close of :has()
        }
        if (op == 0) {
            break; // no combinator at all: the selector is complete
        }
        left = combinator(left, op, parseSimpleSequence());
    }
    return left;
}
// SimpleSequence ::= TypeSelector? ( Hash | Class | Attribute | Pseudo )*
Evaluator parseSimpleSequence() {
    Evaluator sequence = null;
    tq.consumeWhitespace();

    // at most one type selector: a tag name (optionally namespaced, incl. "*|tag"), or the universal "*"
    if (tq.matchesWord() || tq.matches("*|")) {
        sequence = byTag();
    } else if (tq.matchChomp('*')) {
        sequence = new Evaluator.AllElements();
    }

    // then any number of subclass selectors (#id, .class, [attr], :pseudo, ::node), And-ed together
    for (Evaluator next = parseSubclass(); next != null; next = parseSubclass()) {
        sequence = and(sequence, next);
    }

    if (sequence != null) {
        return sequence;
    }
    throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
}
// Joins the left and right evaluators according to the combinator character: '>', ' ', '+' or '~'.
static Evaluator combinator(Evaluator left, char combinator, Evaluator right) {
    if (combinator == '>') {
        // child: extend an existing parent run when the left side already is one, else start a new run
        final ImmediateParentRun chain;
        if (left instanceof ImmediateParentRun) {
            chain = (ImmediateParentRun) left;
        } else {
            chain = new ImmediateParentRun(left);
        }
        chain.add(right);
        return chain;
    }
    if (combinator == ' ') { // descendant
        return and(new StructuralEvaluator.Ancestor(left), right);
    }
    if (combinator == '+') { // adjacent sibling
        return and(new StructuralEvaluator.ImmediatePreviousSibling(left), right);
    }
    if (combinator == '~') { // general sibling
        return and(new StructuralEvaluator.PreviousSibling(left), right);
    }
    throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator);
}
// Subclass: ID | Class | Attribute | Node (::) | Pseudo (:). Returns null when the queue is not at a subclass token.
@Nullable Evaluator parseSubclass() {
    if (tq.matchChomp('#')) {
        return byId();
    }
    if (tq.matchChomp('.')) {
        return byClass();
    }
    if (tq.matches('[')) { // not chomped here: byAttribute consumes the balanced [...] itself
        return byAttribute();
    }
    if (tq.matchChomp("::")) { // must be tested before the single ':' - ::comment etc
        return parseNodeSelector();
    }
    if (tq.matchChomp(':')) {
        return parsePseudoSelector();
    }
    return null;
}
/** Merge two evals into an Or. If left is already an Or, right is added to it and left is returned. */
static Evaluator or(Evaluator left, Evaluator right) {
    if (!(left instanceof CombiningEvaluator.Or)) {
        return new CombiningEvaluator.Or(left, right);
    }
    final CombiningEvaluator.Or existing = (CombiningEvaluator.Or) left;
    existing.add(right);
    return existing;
}
/**
 Merge two evals into an And. A null left yields right unchanged; if left is already an And, right is added to it
 and left is returned.
 */
static Evaluator and(@Nullable Evaluator left, Evaluator right) {
    if (left == null) {
        return right;
    }
    if (!(left instanceof CombiningEvaluator.And)) {
        return new CombiningEvaluator.And(left, right);
    }
    final CombiningEvaluator.And existing = (CombiningEvaluator.And) left;
    existing.add(right);
    return existing;
}
// Parses a pseudo selector; called after the leading ':' has been consumed. Reads the pseudo's name, and dispatches
// to the matching evaluator (consuming any parenthesised argument the pseudo takes). An unrecognised name is a
// parse error. Names are matched exactly as consumed (the switch is on the raw identifier).
private Evaluator parsePseudoSelector() {
final String pseudo = tq.consumeCssIdentifier();
switch (pseudo) {
// index based: :lt(n), :gt(n), :eq(n)
case "lt":
return new Evaluator.IndexLessThan(consumeIndex());
case "gt":
return new Evaluator.IndexGreaterThan(consumeIndex());
case "eq":
return new Evaluator.IndexEquals(consumeIndex());
// nested selector groups
case "has":
return has();
case "is":
return is();
// text / data containment; the boolean selects the "own" variant
case "contains":
return contains(false);
case "containsOwn":
return contains(true);
case "containsWholeText":
return containsWholeText(false);
case "containsWholeOwnText":
return containsWholeText(true);
case "containsData":
return containsData();
// regex matching; the boolean selects the "own" variant
case "matches":
return matches(false);
case "matchesOwn":
return matches(true);
case "matchesWholeText":
return matchesWholeText(false);
case "matchesWholeOwnText":
return matchesWholeText(true);
case "not":
return not();
// structural nth-* selectors: arguments are (last, ofType)
case "nth-child":
return cssNthChild(false, false);
case "nth-last-child":
return cssNthChild(true, false);
case "nth-of-type":
return cssNthChild(false, true);
case "nth-last-of-type":
return cssNthChild(true, true);
// argument-less structural selectors
case "first-child":
return new Evaluator.IsFirstChild();
case "last-child":
return new Evaluator.IsLastChild();
case "first-of-type":
return new Evaluator.IsFirstOfType();
case "last-of-type":
return new Evaluator.IsLastOfType();
case "only-child":
return new Evaluator.IsOnlyChild();
case "only-of-type":
return new Evaluator.IsOnlyOfType();
case "empty":
return new Evaluator.IsEmpty();
case "blank":
return new NodeEvaluator.BlankValue();
case "root":
return new Evaluator.IsRoot();
case "matchText":
return new Evaluator.MatchText();
default:
// the name has already been consumed, so the reported remainder starts after it
throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
}
}
// Parses a node type selector (::comment etc); called after the leading "::" has been consumed. Any subclass
// selectors chained directly onto it (like ::comment:contains()) are parsed in node context and And-ed on.
private Evaluator parseNodeSelector() {
    final String nodeType = tq.consumeCssIdentifier();
    inNodeContext = true; // from here, :contains / :matches act on the node's value

    Evaluator eval;
    switch (nodeType) {
        case "node":
            eval = new NodeEvaluator.InstanceType(Node.class, nodeType);
            break;
        case "leafnode":
            eval = new NodeEvaluator.InstanceType(LeafNode.class, nodeType);
            break;
        case "text":
            eval = new NodeEvaluator.InstanceType(TextNode.class, nodeType);
            break;
        case "comment":
            eval = new NodeEvaluator.InstanceType(Comment.class, nodeType);
            break;
        case "data":
            eval = new NodeEvaluator.InstanceType(DataNode.class, nodeType);
            break;
        case "cdata":
            eval = new NodeEvaluator.InstanceType(CDataNode.class, nodeType);
            break;
        default:
            throw new Selector.SelectorParseException(
                "Could not parse query '%s': unknown node type '::%s'", query, nodeType);
    }

    // chained subclasses, still in node context
    for (Evaluator next = parseSubclass(); next != null; next = parseSubclass()) {
        eval = and(eval, next);
    }

    inNodeContext = false; // leave node context
    return eval;
}
// #id - called after the '#' has been consumed; the identifier must not be empty
private Evaluator byId() {
    final String identifier = tq.consumeCssIdentifier();
    Validate.notEmpty(identifier);
    return new Evaluator.Id(identifier);
}
// .class - called after the '.' has been consumed; the identifier must not be empty, and is trimmed before use
private Evaluator byClass() {
    final String identifier = tq.consumeCssIdentifier();
    Validate.notEmpty(identifier);
    final String trimmedName = identifier.trim();
    return new Evaluator.Class(trimmedName);
}
// Type selector: tag, *|tag, ns|*, or ns|tag
private Evaluator byTag() {
    // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make
    // the tag in the selector case-sensitive (and also attribute names). But for now, normalize for consistency -
    // both the selector and the element tag
    final String selector = normalize(tq.consumeElementSelector());
    Validate.notEmpty(selector);

    if (selector.startsWith("*|")) {
        // wildcard namespace: matches the plain tag, or any tag ending in ":" + tag
        final String localName = selector.substring(2); // strip *|
        return new CombiningEvaluator.Or(
            new Evaluator.Tag(localName),
            new Evaluator.TagEndsWith(":" + localName)
        );
    }

    if (selector.endsWith("|*")) {
        // ns|* : any tag in the namespace, i.e. starting with "ns:"
        final String prefix = selector.substring(0, selector.length() - 2) + ":"; // strip |*, to ns:
        return new Evaluator.TagStartsWith(prefix);
    }

    // ns|tag : flip "abc|def" to "abc:def"; a plain tag is used as is
    final String tagName = selector.contains("|") ? selector.replace("|", ":") : selector;
    return new Evaluator.Tag(tagName);
}
// [attr...] - chomps the balanced bracket contents, and parses them with their own (closed after use) token queue
private Evaluator byAttribute() {
    final String bracketContents = tq.chompBalanced('[', ']');
    try (TokenQueue attributeQueue = new TokenQueue(bracketContents)) {
        return evaluatorForAttribute(attributeQueue);
    }
}
// Builds the evaluator for the contents of an attribute selector: a key, optionally followed by an operator
// (one of AttributeEvals) and a value.
private Evaluator evaluatorForAttribute(TokenQueue cq) {
    final String key = normalize(cq.consumeToAny(AttributeEvals)); // everything up to the operator, if any
    Validate.notEmpty(key);
    Validate.isFalse(key.equals("abs:"), "Absolute attribute key must have a name");
    cq.consumeWhitespace();

    // key only, no operator / value
    if (cq.isEmpty()) {
        if (key.startsWith("^")) { // [^prefix] - attribute name starts with
            return new Evaluator.AttributeStarting(key.substring(1));
        }
        if (key.equals("*")) { // [*] - any attribute
            return new Evaluator.AttributeStarting("");
        }
        return new Evaluator.Attribute(key);
    }

    // key operator value
    if (cq.matchChomp('=')) {
        return new Evaluator.AttributeWithValue(key, cq.remainder());
    }
    if (cq.matchChomp("!=")) {
        return new Evaluator.AttributeWithValueNot(key, cq.remainder());
    }
    if (cq.matchChomp("^=")) {
        return new Evaluator.AttributeWithValueStarting(key, cq.remainder());
    }
    if (cq.matchChomp("$=")) {
        return new Evaluator.AttributeWithValueEnding(key, cq.remainder());
    }
    if (cq.matchChomp("*=")) {
        return new Evaluator.AttributeWithValueContaining(key, cq.remainder());
    }
    if (cq.matchChomp("~=")) {
        return new Evaluator.AttributeWithValueMatching(key, Regex.compile(cq.remainder()));
    }
    throw new Selector.SelectorParseException(
        "Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder());
}
// pseudo selectors :nth-child, :nth-last-child, :nth-of-type, :nth-last-of-type
// NthStepOffset matches the "an+b" form: group 1 = signed step, 2 = step sign, 3 = step digits, 4 = offset (which
// may contain whitespace around its sign, as in "2n + 1").
private static final Pattern NthStepOffset = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE);
// NthOffset matches a bare (optionally signed) index, as in "5"
private static final Pattern NthOffset = Pattern.compile("([+-])?(\\d+)");

/**
 Parses the parenthesised argument of an nth-* pseudo selector into a step and an offset, and creates the matching
 evaluator. Accepted forms are {@code odd} (step 2, offset 1), {@code even} (step 2, offset 0), {@code an+b}, and a
 bare index {@code b} (step 0).
 @param last if the selector is one of the {@code nth-last-*} variants
 @param ofType if the selector is one of the {@code *-of-type} variants
 @return the evaluator for the selector
 @throws Selector.SelectorParseException if the argument is in none of the accepted forms
 */
private Evaluator cssNthChild(boolean last, boolean ofType) {
    String arg = normalize(consumeParens()); // arg is like "odd", or "-n+2", within nth-child(odd)
    final int step, offset;
    if ("odd".equals(arg)) {
        step = 2;
        offset = 1;
    } else if ("even".equals(arg)) {
        step = 2;
        offset = 0;
    } else {
        Matcher stepOffsetM, stepM;
        if ((stepOffsetM = NthStepOffset.matcher(arg)).matches()) {
            if (stepOffsetM.group(3) != null) // has digits, like 3n+2 or -3n+2
                step = Integer.parseInt(stepOffsetM.group(1).replaceFirst("^\\+", ""));
            else // no digits, might be like n+2, or -n+2. if group(2) == "-", it's -1;
                step = "-".equals(stepOffsetM.group(2)) ? -1 : 1;
            // The pattern allows whitespace around the offset's sign ("2n + 1"), which Integer.parseInt rejects;
            // strip it (and any leading '+') before parsing.
            final String offsetPart = stepOffsetM.group(4);
            offset = offsetPart != null
                ? Integer.parseInt(offsetPart.replaceAll("\\s+", "").replaceFirst("^\\+", ""))
                : 0;
        } else if ((stepM = NthOffset.matcher(arg)).matches()) {
            step = 0;
            offset = Integer.parseInt(stepM.group().replaceFirst("^\\+", ""));
        } else {
            throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg);
        }
    }

    return ofType
        ? (last ? new Evaluator.IsNthLastOfType(step, offset) : new Evaluator.IsNthOfType(step, offset))
        : (last ? new Evaluator.IsNthLastChild(step, offset) : new Evaluator.IsNthChild(step, offset));
}
// Chomps a balanced (...) run from the queue, and returns what chompBalanced gives back for it
private String consumeParens() {
    final String contents = tq.chompBalanced('(', ')');
    return contents;
}
// Reads the numeric argument of :lt(n), :gt(n) and :eq(n); surrounding whitespace is trimmed before validating
private int consumeIndex() {
    final String digits = consumeParens().trim();
    Validate.isTrue(StringUtil.isNumeric(digits), "Index must be numeric");
    return Integer.parseInt(digits);
}
// pseudo selector :has(selector) - the nested selector group is wrapped in a StructuralEvaluator.Has
private Evaluator has() {
    final String missingSelector = ":has() must have a selector";
    return parseNested(StructuralEvaluator.Has::new, missingSelector);
}
// pseudo selector :is(selector list) - the nested selector group is wrapped in a StructuralEvaluator.Is
private Evaluator is() {
    final String missingSelector = ":is() must have a selector";
    return parseNested(StructuralEvaluator.Is::new, missingSelector);
}
// Parses a parenthesised selector group on the current queue - the argument of :has(...) or :is(...) - and wraps
// the resulting evaluator with func. err is the validation message used if the opening or the closing parenthesis
// is missing.
private Evaluator parseNested(Function<Evaluator, Evaluator> func, String err) {
    Validate.isTrue(tq.matchChomp('('), err);
    Evaluator eval = parseSelectorGroup();
    Validate.isTrue(tq.matchChomp(')'), err);
    return func.apply(eval);
}
// pseudo selector :contains(text), :containsOwn(text). In node context (after ::comment etc), both variants test
// the node's value instead of element text.
private Evaluator contains(boolean own) {
    final String selectorName = own ? ":containsOwn" : ":contains";
    final String needle = TokenQueue.unescape(consumeParens());
    Validate.notEmpty(needle, selectorName + "(text) query must not be empty");

    if (inNodeContext) {
        return new NodeEvaluator.ContainsValue(needle);
    }
    if (own) {
        return new Evaluator.ContainsOwnText(needle);
    }
    return new Evaluator.ContainsText(needle);
}
// pseudo selector :containsWholeText(text), :containsWholeOwnText(text)
private Evaluator containsWholeText(boolean own) {
    final String selectorName = own ? ":containsWholeOwnText" : ":containsWholeText";
    final String needle = TokenQueue.unescape(consumeParens());
    Validate.notEmpty(needle, selectorName + "(text) query must not be empty");

    if (own) {
        return new Evaluator.ContainsWholeOwnText(needle);
    }
    return new Evaluator.ContainsWholeText(needle);
}
// pseudo selector :containsData(data) - the (unescaped) argument must not be empty
private Evaluator containsData() {
    final String needle = TokenQueue.unescape(consumeParens());
    Validate.notEmpty(needle, ":containsData(text) query must not be empty");
    return new Evaluator.ContainsData(needle);
}
// pseudo selector :matches(regex), :matchesOwn(regex). In node context (after ::comment etc), both variants test
// the node's value instead of element text.
private Evaluator matches(boolean own) {
    final String selectorName = own ? ":matchesOwn" : ":matches";
    final String expression = consumeParens(); // don't unescape, as regex bits will be escaped
    Validate.notEmpty(expression, selectorName + "(regex) query must not be empty");
    final Regex compiled = Regex.compile(expression);

    if (inNodeContext) {
        return new NodeEvaluator.MatchesValue(compiled);
    }
    if (own) {
        return new Evaluator.MatchesOwn(compiled);
    }
    return new Evaluator.Matches(compiled);
}
// pseudo selector :matchesWholeText(regex), :matchesWholeOwnText(regex)
private Evaluator matchesWholeText(boolean own) {
    final String selectorName = own ? ":matchesWholeOwnText" : ":matchesWholeText";
    final String expression = consumeParens(); // don't unescape, as regex bits will be escaped
    Validate.notEmpty(expression, selectorName + "(regex) query must not be empty");
    final Regex compiled = Regex.compile(expression);

    if (own) {
        return new Evaluator.MatchesWholeOwnText(compiled);
    }
    return new Evaluator.MatchesWholeText(compiled);
}
// pseudo selector :not(selector) - the sub-query is parsed on its own, via the static parse(String), then negated
private Evaluator not() {
    final String inner = consumeParens();
    Validate.notEmpty(inner, ":not(selector) subselect must not be empty");
    final Evaluator negated = parse(inner);
    return new StructuralEvaluator.Not(negated);
}
// The string form of the parser is the (trimmed) query it was created with
@Override
public String toString() {
    return this.query;
}
// Closes this parser by closing its token queue; invoked automatically when the parser is used in a
// try-with-resources block, as in parse(String).
@Override
public void close() {
tq.close();
}
}
================================================
FILE: src/main/java/org/jsoup/select/Selector.java
================================================
package org.jsoup.select;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.parser.TokenQueue;
import org.jspecify.annotations.Nullable;
import java.util.Collection;
import java.util.HashSet;
import java.util.stream.Stream;
/**
CSS element selector, that finds elements matching a query.
Selector syntax
A selector is a chain of simple selectors, separated by combinators. Selectors are case-insensitive (including
against elements, attributes, and attribute values).
The universal selector {@code *} is implicit when no element selector is supplied (i.e. {@code .header} and
{@code *.header} are equivalent).
You can easily test different selectors using the Try jsoup online playground.
| Pattern | Matches | Example |
* | any element | * |
tag | elements with the given tag name | div |
*|E | elements of type E in any namespace (including non-namespaced) | *|name finds <dc:name> and <name> elements |
ns|E | elements of type E in the namespace ns | dc|name finds <dc:name> elements |
ns|* | all elements in the namespace ns | dc|* finds <dc:p> and <dc:img>elements |
#id | elements with attribute ID of "id" | div#wrap, #logo |
.class | elements with a class name of "class" | div.left, .result |
[attr] | elements with an attribute named "attr" (with any value) | a[href], [title] |
[^attrPrefix] | elements with an attribute name starting with "attrPrefix". Use to find elements with HTML5 datasets | [^data-], div[^data-] |
[attr=val] | elements with an attribute named "attr", and value equal to "val" | img[width=500], a[rel=nofollow] |
[attr="val"] | elements with an attribute named "attr", and value equal to "val" | span[hello="Cleveland"][goodbye="Columbus"], a[rel="nofollow"] |
[attr^=valPrefix] | elements with an attribute named "attr", and value starting with "valPrefix" | a[href^=http:] |
[attr$=valSuffix] | elements with an attribute named "attr", and value ending with "valSuffix" | img[src$=.png] |
[attr*=valContaining] | elements with an attribute named "attr", and value containing "valContaining" | a[href*=/search/] |
[attr~=regex] | elements with an attribute named "attr", and value matching the regular expression | img[src~=(?i)\\.(png|jpe?g)] |
[*] | elements with any attribute | p[*] finds p elements that have at least one attribute; p:not([*]) finds those with no attributes |
| The above may be combined in any order | div.header[title] |
Combinators |
E F | an F element descended from an E element | div a, .logo h1 |
E {@literal >} F | an F direct child of E | ol {@literal >} li |
E + F | an F element immediately preceded by sibling E | li + li, div.head + div |
E ~ F | an F element preceded by sibling E | h1 ~ p |
E, F, G | all matching elements E, F, or G | a[href], div, h3 |
Pseudo selectors |
:lt(n) | elements whose sibling index is less than n | td:lt(3) finds the first 3 cells of each row |
:gt(n) | elements whose sibling index is greater than n | td:gt(1) finds cells after skipping the first two |
:eq(n) | elements whose sibling index is equal to n | td:eq(0) finds the first cell of each row |
:has(selector) | elements that contains at least one element matching the selector | div:has(p) finds divs that contain p elements.
div:has(> a) selects div elements that have at least one direct child a element.
section:has(h1, h2) finds section elements that contain a h1 or a h2 element |
:is(selector list) | elements that match any of the selectors in the selector list | :is(h1, h2, h3, h4, h5, h6) finds any heading element.
:is(section, article) > :is(h1, h2) finds a h1 or h2 that is a direct child of a section or an article |
:not(selector) | elements that do not match the selector. See also {@link Elements#not(String)} | div:not(.logo) finds all divs that do not have the "logo" class.div:not(:has(div)) finds divs that do not contain divs.
|
:contains(text) | elements that contain the specified text. The search is case insensitive. The text may appear in the found element, or any of its descendants. The text is whitespace normalized. To find content that includes parentheses, escape those with a {@code \}. | p:contains(jsoup) finds p elements containing the text "jsoup". {@code p:contains(hello \(there\))} finds p elements containing the text "Hello (There)". |
:containsOwn(text) | elements that directly contain the specified text. The search is case insensitive. The text must appear in the found element, not any of its descendants. | p:containsOwn(jsoup) finds p elements with own text "jsoup". |
:containsData(data) | elements that contain the specified data. The contents of {@code script} and {@code style} elements, and {@code comment} nodes (etc) are considered data nodes, not text nodes. The search is case insensitive. The data may appear in the found element, or any of its descendants. | script:containsData(jsoup) finds script elements containing the data "jsoup". |
:containsWholeText(text) | elements that contains the specified non-normalized text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, or any of its descendants. To find content that includes parentheses, escape those with a {@code \}. | p:containsWholeText(jsoup\nThe Java HTML Parser) finds p elements containing the text "jsoup\nThe Java HTML Parser" (and not other variations of whitespace or casing, as :contains() would. Note that {@code br} elements are presented as a newline. |
:containsWholeOwnText(text) | elements that directly contain the specified non-normalized text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, but not in its descendants. To find content that includes parentheses, escape those with a {@code \}. | p:containsWholeOwnText(jsoup\nThe Java HTML Parser) finds p elements directly containing the text "jsoup\nThe Java HTML Parser" (and not other variations of whitespace or casing, as :contains() would. Note that {@code br} elements are presented as a newline. |
:matches(regex) | elements containing whitespace normalized text that matches the specified regular expression. The text may appear in the found element, or any of its descendants. | td:matches(\\d+) finds table cells containing digits. div:matches((?i)login) finds divs containing the text, case insensitively. |
:matchesWholeText(regex) | elements containing non-normalized whole text that matches the specified regular expression. The text may appear in the found element, or any of its descendants. | td:matchesWholeText(\\s{2,}) finds table cells containing a run of at least two space characters. |
:matchesWholeOwnText(regex) | elements whose own non-normalized whole text matches the specified regular expression. The text must appear in the found element, not any of its descendants. | td:matchesWholeOwnText(\n\\d+) finds table cells directly containing digits following a newline. |
| The above may be combined in any order and with other selectors | .light:contains(name):eq(0) |
:matchText | treats text nodes as elements, and so allows you to match against and select text nodes. Note that using this selector will modify the DOM, so you may want to {@code clone} your document before using. Deprecated. This selector is deprecated and will be removed in a future version. Migrate to ::text using the Element#selectNodes() method instead. | {@code p:matchText:first-child} with input {@code <p>One<br>Two</p>} will return one {@link org.jsoup.nodes.PseudoTextElement} with text "{@code One}". |
Structural pseudo selectors |
:root | The element that is the root of the document. In HTML, this is the html element | :root |
:nth-child(an+b) | elements that have an+b-1 siblings before it in the document tree, for any positive integer or zero value of n, and has a parent element. For values of a and b greater than zero, this effectively divides the element's children into groups of a elements (the last group taking the remainder), and selecting the bth element of each group. For example, this allows the selectors to address every other row in a table, and could be used to alternate the color of paragraph text in a cycle of four. The a and b values must be integers (positive, negative, or zero). The index of the first child of an element is 1.
Additionally, :nth-child() supports odd and even as arguments. odd is the same as 2n+1, and even is the same as 2n. | tr:nth-child(2n+1) finds every odd row of a table. :nth-child(10n-1) the 9th, 19th, 29th, etc, element. li:nth-child(5) the 5th li |
:nth-last-child(an+b) | elements that have an+b-1 siblings after it in the document tree. Otherwise like :nth-child() | tr:nth-last-child(-n+2) the last two rows of a table |
:nth-of-type(an+b) | pseudo-class notation represents an element that has an+b-1 siblings with the same expanded element name before it in the document tree, for any zero or positive integer value of n, and has a parent element | img:nth-of-type(2n+1) |
:nth-last-of-type(an+b) | pseudo-class notation represents an element that has an+b-1 siblings with the same expanded element name after it in the document tree, for any zero or positive integer value of n, and has a parent element | img:nth-last-of-type(2n+1) |
:first-child | elements that are the first child of some other element. | div {@literal >} p:first-child |
:last-child | elements that are the last child of some other element. | ol {@literal >} li:last-child |
:first-of-type | elements that are the first sibling of its type in the list of children of its parent element | dl dt:first-of-type |
:last-of-type | elements that are the last sibling of its type in the list of children of its parent element | tr {@literal >} td:last-of-type |
:only-child | elements that have a parent element and whose parent element have no other element children | |
:only-of-type | an element that has a parent element and whose parent element has no other element children with the same expanded element name | |
:empty | elements that contain no child elements or nodes, with the exception of blank text nodes, comments, XML declarations, and doctype declarations. In other words, it matches elements that are effectively empty of meaningful content. | li:not(:empty) |
Node pseudo selectors |
These selectors enable matching specific leaf nodes, including Comments, TextNodes. When used with {@link Element#select(String)}, these can be used with structural selectors such as :has() to refine which Elements are matched. To retrieve matching Nodes directly, use {@link Element#selectNodes(String)}. |
| ::node | Matches any node | |
| ::leafnode | Matches any leaf-node (this is, a Node which is not an Element) | |
| ::comment | Matches a Comment node | |
| ::text | Matches a TextNode | |
| ::data | Matches a DataNode (e.g. the content of a script or a style element) | |
| ::cdata | Matches a CDataNode (which are only present in XML) | |
| ::node:contains(text) | Matches a node that has a (normalized, case-insensitive) value containing text. | ::comment:contains(foo bar) |
| ::node:matches(regex) | Matches a node that has a value matching the regex. | ::comment:matches(\\d+) |
| ::node:blank | Matches a node that has either no value, or a value of only whitespace. | ::comment:not(:blank) |
A word on using regular expressions in these selectors: depending on the content of the regex, you will need to quote the pattern using Pattern.quote("regex") for it to parse correctly through both the selector parser and the regex parser. E.g. String query = "div:matches(" + Pattern.quote(regex) + ");".
Escaping special characters: to match a tag, ID, or other selector that does not follow the regular CSS syntax, the query must be escaped with the \ character. For example, to match by ID {@code <p id="i.d">}, use {@code document.select("#i\\.d")}.
@see Element#select(String css)
@see Element#selectFirst(String css)
@see Element#select(Evaluator eval)
@see Element#selectNodes(String css)
@see Element#selectNodes(String css, Class nodeType)
@see Elements#select(String css)
@see Element#selectXpath(String xpath) */
public class Selector {
// not instantiable: this class only exposes static members, so the sole constructor is private
private Selector() {}
/**
 Select the elements beneath a root element that match a CSS query. The query is parsed into an Evaluator and
 the work is handed to {@link #select(Evaluator, Element)}.
 @param query CSS selector; validated as non-empty before parsing
 @param root root element to descend into
 @return matching elements, empty if none
 @throws Selector.SelectorParseException (unchecked) on an invalid CSS query.
 */
public static Elements select(String query, Element root) {
    Validate.notEmpty(query);
    final Evaluator parsed = evaluatorOf(query);
    return select(parsed, root);
}
/**
 Select the elements beneath a root element that are accepted by an already-built Evaluator.
 @param evaluator CSS Evaluator; validated as non-null
 @param root root (context) element to start from; validated as non-null
 @return matching elements, empty if none
 */
public static Elements select(Evaluator evaluator, Element root) {
    Validate.notNull(evaluator);
    Validate.notNull(root);
    final Elements matches = Collector.collect(evaluator, root);
    return matches;
}
/**
 Finds a Stream of elements matching the CSS query.
 @param query CSS selector; validated as non-empty before parsing
 @param root root element to descend into
 @return a Stream of matching elements, empty if none
 @throws Selector.SelectorParseException (unchecked) on an invalid CSS query.
 @since 1.19.1
 */
public static Stream<Element> selectStream(String query, Element root) {
    Validate.notEmpty(query);
    // parameterized (not raw) return type, so callers get Elements from the stream without casting
    return selectStream(evaluatorOf(query), root);
}
/**
 Finds a Stream of elements matching the evaluator.
 @param evaluator CSS Evaluator; validated as non-null
 @param root root element to descend into; validated as non-null
 @return a Stream of matching elements, empty if none
 @since 1.19.1
 */
public static Stream<Element> selectStream(Evaluator evaluator, Element root) {
    Validate.notNull(evaluator);
    Validate.notNull(root);
    // parameterized (not raw) return type, so callers get Elements from the stream without casting
    return Collector.stream(evaluator, root);
}
/**
 Find elements matching the query, across multiple roots. Elements will be deduplicated (in the case of
 overlapping hierarchies). The query is parsed once and the resulting Evaluator is reused for every root.
 @param query CSS selector; validated as non-empty before parsing
 @param roots root elements to descend into; validated as non-null
 @return matching elements, empty if none
 @throws Selector.SelectorParseException (unchecked) on an invalid CSS query.
 */
public static Elements select(String query, Iterable<Element> roots) {
    Validate.notEmpty(query);
    Validate.notNull(roots);
    Evaluator evaluator = evaluatorOf(query);
    Elements elements = new Elements();
    HashSet<Element> seenElements = new HashSet<>(); // dedupe elements by identity, as .equals is ==
    for (Element root : roots) {
        Stream<Element> matches = selectStream(evaluator, root);
        matches
            .filter(seenElements::add) // Set.add returns false for an element already collected
            .forEach(elements::add);
    }
    return elements;
}
/**
 Exclude set: builds a new Elements holding every member of {@code elements} that is not equal to any member of
 {@code outs}, keeping the iteration order of {@code elements}. Neither input collection is modified.
 Package open so that Elements can implement the .not() selector.
 @param elements candidate elements
 @param outs elements to leave out of the result
 @return a new Elements containing the candidates that were not excluded
 */
static Elements filterOut(Collection<Element> elements, Collection<Element> outs) {
    Elements output = new Elements();
    for (Element el : elements) {
        // Collection.contains performs the el.equals(out) scan that was previously written out by hand
        if (!outs.contains(el)) {
            output.add(el);
        }
    }
    return output;
}
/**
 Locate the first Element beneath the root that matches the CSS query.
 @param cssQuery CSS selector; validated as non-empty before parsing
 @param root root element to descend into
 @return the matching element, or null if none.
 */
public static @Nullable Element selectFirst(String cssQuery, Element root) {
    Validate.notEmpty(cssQuery);
    final Evaluator parsed = evaluatorOf(cssQuery);
    return Collector.findFirst(parsed, root);
}
/**
 Find the first element matching the query, across multiple roots. Roots are searched in iteration order and
 the search stops at the first root that yields a match. The query is parsed once and reused for every root.
 @param cssQuery CSS selector; validated as non-empty before parsing
 @param roots root elements to descend into; validated as non-null
 @return the first matching element, or {@code null} if none
 @since 1.19.1
 */
public static @Nullable Element selectFirst(String cssQuery, Iterable<Element> roots) {
    Validate.notEmpty(cssQuery);
    Validate.notNull(roots);
    Evaluator evaluator = evaluatorOf(cssQuery);
    for (Element root : roots) {
        Element first = Collector.findFirst(evaluator, root);
        if (first != null) {
            return first;
        }
    }
    return null;
}
/**
 Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would otherwise not be
 valid in a selector. Delegates to {@code TokenQueue.escapeCssIdentifier}.
 @param in the identifier to escape
 @return the escaped identifier
 @see "CSS Object Model, serialize an identifier"
 @since 1.20.1
 */
public static String escapeCssIdentifier(String in) {
    final String escaped = TokenQueue.escapeCssIdentifier(in);
    return escaped;
}
/**
 Unescape a CSS identifier (ID or class). The input is wrapped in a TokenQueue, one identifier is consumed from
 it, and the queue is closed again before returning.
 Note: For backwards compatibility this method supports improperly formatted CSS identifiers, e.g. {@code 1} instead
 of {@code \31}.
 @param in the (possibly escaped) identifier text
 @return The unescaped identifier.
 @throws IllegalArgumentException if an invalid escape sequence was found.
 @see "CSS Syntax Module Level 3, Consume an ident sequence"
 @see "CSS Syntax Module Level 3, ident-token"
 @since 1.20.1
 */
public static String unescapeCssIdentifier(String in) {
    try (TokenQueue queue = new TokenQueue(in)) {
        final String identifier = queue.consumeCssIdentifier();
        return identifier;
    }
}
/**
 Compile a CSS query into an Evaluator. When the same query is going to be evaluated repeatedly, it may be more
 efficient to parse it once here and reuse the returned Evaluator.
 @param css CSS query
 @return Evaluator
 @see Selector
 @throws Selector.SelectorParseException if the CSS query is invalid
 @since 1.21.1
 */
public static Evaluator evaluatorOf(String css) {
    final Evaluator parsed = QueryParser.parse(css);
    return parsed;
}
/**
 Unchecked exception raised for a CSS query that cannot be parsed.
 */
public static class SelectorParseException extends IllegalStateException {
    /**
     Create an exception whose message is used verbatim (no format expansion).
     @param msg the message
     */
    public SelectorParseException(String msg) {
        super(msg);
    }

    /**
     Create an exception whose message is a {@link String#format(String, Object...)} template.
     @param msg format template
     @param msgArgs template arguments
     */
    public SelectorParseException(String msg, Object... msgArgs) {
        super(render(msg, msgArgs));
    }

    /**
     Create an exception with a formatted message and an underlying cause.
     @param cause the underlying cause
     @param msg format template
     @param msgArgs template arguments
     */
    public SelectorParseException(Throwable cause, String msg, Object... msgArgs) {
        super(render(msg, msgArgs), cause);
    }

    // Expands the message template with its arguments.
    private static String render(String template, Object[] args) {
        return String.format(template, args);
    }
}
}
================================================
FILE: src/main/java/org/jsoup/select/StructuralEvaluator.java
================================================
package org.jsoup.select;
import org.jsoup.internal.SoftPool;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.LeafNode;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.NodeIterator;
import org.jsoup.nodes.TextNode;
import java.util.ArrayList;
import java.util.Map;
import java.util.WeakHashMap;
/**
* Base structural evaluator.
*/
abstract class StructuralEvaluator extends Evaluator {
final Evaluator evaluator;
boolean wantsNodes; // if the evaluator requested nodes, not just elements
public StructuralEvaluator(Evaluator evaluator) {
this.evaluator = evaluator;
wantsNodes = evaluator.wantsNodes();
}
@Override
boolean wantsNodes() {
return wantsNodes;
}
// Memoize inner matches, to save repeated re-evaluations of parent, sibling etc.
// root + element: Boolean matches. ThreadLocal in case the Evaluator is compiled then reused across multi threads
final ThreadLocal