Repository: jhy/jsoup Branch: master Commit: 8951f55c5965 Files: 237 Total size: 2.4 MB Directory structure: gitextract_knobfku8/ ├── .gitattributes ├── .github/ │ ├── dependabot.yml │ └── workflows/ │ ├── build.yml │ ├── cifuzz.yml │ └── codeql.yml ├── .gitignore ├── CHANGES.md ├── LICENSE ├── README.md ├── SECURITY.md ├── change-archive.txt ├── pom.xml └── src/ ├── main/ │ ├── java/ │ │ └── org/ │ │ └── jsoup/ │ │ ├── Connection.java │ │ ├── HttpStatusException.java │ │ ├── Jsoup.java │ │ ├── Progress.java │ │ ├── SerializationException.java │ │ ├── UnsupportedMimeTypeException.java │ │ ├── examples/ │ │ │ ├── HtmlToPlainText.java │ │ │ ├── ListLinks.java │ │ │ ├── Wikipedia.java │ │ │ └── package-info.java │ │ ├── helper/ │ │ │ ├── AuthenticationHandler.java │ │ │ ├── CookieUtil.java │ │ │ ├── DataUtil.java │ │ │ ├── HttpConnection.java │ │ │ ├── Re2jRegex.java │ │ │ ├── Regex.java │ │ │ ├── RequestAuthenticator.java │ │ │ ├── RequestDispatch.java │ │ │ ├── RequestExecutor.java │ │ │ ├── UrlBuilder.java │ │ │ ├── UrlConnectionExecutor.java │ │ │ ├── Validate.java │ │ │ ├── ValidationException.java │ │ │ ├── W3CDom.java │ │ │ └── package-info.java │ │ ├── internal/ │ │ │ ├── ControllableInputStream.java │ │ │ ├── Functions.java │ │ │ ├── Normalizer.java │ │ │ ├── QuietAppendable.java │ │ │ ├── SharedConstants.java │ │ │ ├── SimpleBufferedInput.java │ │ │ ├── SimpleStreamReader.java │ │ │ ├── SoftPool.java │ │ │ ├── StringUtil.java │ │ │ └── package-info.java │ │ ├── nodes/ │ │ │ ├── Attribute.java │ │ │ ├── Attributes.java │ │ │ ├── CDataNode.java │ │ │ ├── Comment.java │ │ │ ├── DataNode.java │ │ │ ├── Document.java │ │ │ ├── DocumentType.java │ │ │ ├── Element.java │ │ │ ├── Entities.java │ │ │ ├── EntitiesData.java │ │ │ ├── FormElement.java │ │ │ ├── LeafNode.java │ │ │ ├── Node.java │ │ │ ├── NodeIterator.java │ │ │ ├── NodeUtils.java │ │ │ ├── Printer.java │ │ │ ├── PseudoTextElement.java │ │ │ ├── Range.java │ │ │ ├── TextNode.java │ │ │ ├── 
XmlDeclaration.java │ │ │ └── package-info.java │ │ ├── package-info.java │ │ ├── parser/ │ │ │ ├── CharacterReader.java │ │ │ ├── HtmlTreeBuilder.java │ │ │ ├── HtmlTreeBuilderState.java │ │ │ ├── ParseError.java │ │ │ ├── ParseErrorList.java │ │ │ ├── ParseSettings.java │ │ │ ├── Parser.java │ │ │ ├── StreamParser.java │ │ │ ├── Tag.java │ │ │ ├── TagSet.java │ │ │ ├── Token.java │ │ │ ├── TokenData.java │ │ │ ├── TokenQueue.java │ │ │ ├── Tokeniser.java │ │ │ ├── TokeniserState.java │ │ │ ├── TreeBuilder.java │ │ │ ├── XmlTreeBuilder.java │ │ │ └── package-info.java │ │ ├── safety/ │ │ │ ├── Cleaner.java │ │ │ ├── Safelist.java │ │ │ └── package-info.java │ │ └── select/ │ │ ├── Collector.java │ │ ├── CombiningEvaluator.java │ │ ├── Elements.java │ │ ├── Evaluator.java │ │ ├── NodeEvaluator.java │ │ ├── NodeFilter.java │ │ ├── NodeTraversor.java │ │ ├── NodeVisitor.java │ │ ├── Nodes.java │ │ ├── QueryParser.java │ │ ├── Selector.java │ │ ├── StructuralEvaluator.java │ │ └── package-info.java │ ├── java11/ │ │ ├── module-info.java │ │ └── org/ │ │ └── jsoup/ │ │ └── helper/ │ │ ├── HttpClientExecutor.java │ │ └── RequestAuthHandler.java │ ├── javadoc/ │ │ └── overview.html │ └── resources/ │ └── META-INF/ │ └── proguard/ │ └── org.jsoup_jsoup.pro └── test/ ├── java/ │ └── org/ │ └── jsoup/ │ ├── JsoupTest.java │ ├── MultiLocaleExtension.java │ ├── SerializationExceptionTest.java │ ├── TextUtil.java │ ├── helper/ │ │ ├── AuthenticationHandlerTest.java │ │ ├── CookieUtilTest.java │ │ ├── DataUtilTest.java │ │ ├── HttpConnectionTest.java │ │ ├── RegexTest.java │ │ ├── ValidateTest.java │ │ └── W3CDomTest.java │ ├── integration/ │ │ ├── Benchmark.java │ │ ├── ConnectIT.java │ │ ├── ConnectTest.java │ │ ├── FuzzFixesIT.java │ │ ├── FuzzFixesTest.java │ │ ├── ParseTest.java │ │ ├── ProxyTest.java │ │ ├── SafelistExtensionTest.java │ │ ├── SessionIT.java │ │ ├── SessionTest.java │ │ ├── TestServer.java │ │ └── servlets/ │ │ ├── AuthFilter.java │ │ ├── BaseServlet.java 
│ │ ├── CookieServlet.java │ │ ├── DeflateServlet.java │ │ ├── EchoServlet.java │ │ ├── FileServlet.java │ │ ├── HelloServlet.java │ │ ├── InterruptedServlet.java │ │ ├── ProxyServlet.java │ │ ├── RedirectServlet.java │ │ └── SlowRider.java │ ├── internal/ │ │ ├── ControllableInputStreamTest.java │ │ ├── QuietAppendableTest.java │ │ ├── ReaderTest.java │ │ ├── SoftPoolTest.java │ │ └── StringUtilTest.java │ ├── nodes/ │ │ ├── AttributeTest.java │ │ ├── AttributesTest.java │ │ ├── BuildEntities.java │ │ ├── CommentTest.java │ │ ├── DataNodeTest.java │ │ ├── DocumentTest.java │ │ ├── DocumentTypeTest.java │ │ ├── ElementIT.java │ │ ├── ElementTest.java │ │ ├── EntitiesTest.java │ │ ├── FormElementTest.java │ │ ├── LeafNodeTest.java │ │ ├── NodeIteratorTest.java │ │ ├── NodeStreamTest.java │ │ ├── NodeTest.java │ │ ├── PrinterTest.java │ │ └── TextNodeTest.java │ ├── parser/ │ │ ├── AttributeParseTest.java │ │ ├── CharacterReaderTest.java │ │ ├── HtmlParserTest.java │ │ ├── HtmlTreeBuilderStateTest.java │ │ ├── HtmlTreeBuilderTest.java │ │ ├── ParserIT.java │ │ ├── ParserSettingsTest.java │ │ ├── ParserTest.java │ │ ├── PositionTest.java │ │ ├── StreamParserTest.java │ │ ├── TagSetTest.java │ │ ├── TagTest.java │ │ ├── TokenQueueTest.java │ │ ├── TokeniserStateTest.java │ │ ├── TokeniserTest.java │ │ └── XmlTreeBuilderTest.java │ ├── safety/ │ │ ├── CleanerTest.java │ │ └── SafelistTest.java │ └── select/ │ ├── CssTest.java │ ├── ElementsTest.java │ ├── EvaluatorDebug.java │ ├── EvaluatorTest.java │ ├── NodesTest.java │ ├── QueryParserTest.java │ ├── SelectorIT.java │ ├── SelectorTest.java │ ├── StructuralEvaluatorTest.java │ ├── TraversorTest.java │ └── XpathTest.java ├── java11/ │ └── org/ │ └── jsoup/ │ ├── helper/ │ │ ├── HttpClientExecutorTest.java │ │ └── HttpClientTestAccess.java │ └── integration/ │ ├── HttpClientConnectIT.java │ ├── HttpClientConnectTest.java │ ├── HttpClientSessionIT.java │ └── HttpClientSessionTest.java └── resources/ ├── bomtests/ │ ├── 
bom_utf16be.html │ ├── bom_utf16le.html │ ├── bom_utf32be.html │ ├── bom_utf32le.html │ └── bom_utf8.html ├── fuzztests/ │ ├── ex-inselect16.html │ └── garble.html ├── htmltests/ │ ├── README │ ├── adopt-1.html │ ├── basehref.html │ ├── charset-base.html │ ├── comments.html │ ├── escapes-across-buffer.html │ ├── form-tests.html │ ├── gzip.html │ ├── gzip.html.z │ ├── large.html │ ├── lowercase-charset-test.html │ ├── medium.html │ ├── meta-charset-1.html │ ├── meta-charset-2.html │ ├── meta-charset-3.html │ ├── namespaces.xhtml │ ├── table-invalid-elements.html │ ├── table-polymer-template.html │ ├── test-rss.xml │ ├── upload-form.html │ ├── xml-charset.xml │ └── xml-test.xml ├── local-cert/ │ ├── README.md │ ├── cert.conf │ ├── server.crt │ ├── server.key │ ├── server.p12 │ └── server.pfx └── printertests/ ├── input-1.html ├── outline-1.html ├── passthru-1.html └── pretty-1.html ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ * text=auto eol=lf ================================================ FILE: .github/dependabot.yml ================================================ version: 2 updates: - package-ecosystem: maven directory: / schedule: interval: weekly cooldown: default-days: 12 ignore: # Jetty 9.x needed for JDK8 compatibility; it still receives security updates. Only used in tests. - dependency-name: "org.eclipse.jetty:jetty-server" update-types: ["version-update:semver-major"] - dependency-name: "org.eclipse.jetty:jetty-servlet" update-types: ["version-update:semver-major"] # Et tu, junit? 
Keep us on 5, as 6 has min JDK17 - https://docs.junit.org/6.0.0-RC3/release-notes/#release-notes-6.0.0-M1 - dependency-name: "org.junit.jupiter:junit-jupiter" update-types: ["version-update:semver-major"] - package-ecosystem: github-actions directory: / schedule: interval: weekly cooldown: default-days: 12 ================================================ FILE: .github/workflows/build.yml ================================================ name: Build on: push: pull_request: jobs: test: runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-latest, windows-latest, macOS-latest] # choosing to run a reduced set of LTS, current, and next, to balance coverage and execution time java: [8, 17, 25] fail-fast: false name: Test JDK ${{ matrix.java }}, ${{ matrix.os }} steps: - name: Checkout uses: actions/checkout@v6 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v5 with: java-version: ${{ matrix.java }} distribution: 'zulu' cache: 'maven' - name: Maven Compile run: mvn -X compile -B --file pom.xml - name: Maven Verify run: mvn -X verify -B --file pom.xml ... 
================================================ FILE: .github/workflows/cifuzz.yml ================================================ name: CIFuzz on: [pull_request] jobs: Fuzzing: runs-on: ubuntu-latest steps: - name: Build Fuzzers id: build uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master with: oss-fuzz-project-name: 'jsoup' dry-run: false language: jvm - name: Run Fuzzers uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master with: oss-fuzz-project-name: 'jsoup' fuzz-seconds: 600 dry-run: false language: jvm - name: Upload Crash uses: actions/upload-artifact@v7 if: failure() && steps.build.outcome == 'success' with: name: artifacts path: ./out/artifacts ================================================ FILE: .github/workflows/codeql.yml ================================================ name: CodeQL on: push: branches: - master pull_request: jobs: codeql: runs-on: ubuntu-latest name: "CodeQL" steps: - name: Checkout uses: actions/checkout@v6 - name: Set up JDK uses: actions/setup-java@v5 with: java-version: 17 distribution: 'temurin' cache: 'maven' - name: CodeQL Initialization uses: github/codeql-action/init@v4 with: languages: java queries: +security-and-quality - name: Autobuild uses: github/codeql-action/autobuild@v4 - name: CodeQL Analysis uses: github/codeql-action/analyze@v4 ================================================ FILE: .gitignore ================================================ .idea/ jsoup.iml jsoup.ipr jsoup.iws target/ .classpath .project .settings/ *Thrash* bin/ .vscode/ .java-version .DS_Store ================================================ FILE: CHANGES.md ================================================ # jsoup Changelog ## 1.22.2 (PENDING) ### Improvements * Expanded and clarified `NodeTraversor` support for in-place DOM rewrites during `NodeVisitor.head()`. Current-node edits such as `remove`, `replace`, and `unwrap` now recover more predictably, while traversal stays within the original root subtree. 
This makes single-pass tree cleanup and normalization visitors easier to write, for example when unwrapping presentational elements or replacing text nodes as you walk the DOM. [#2472](https://github.com/jhy/jsoup/issues/2472) * Documentation: clarified that a configured `Cleaner` may be reused across concurrent threads, and that shared `Safelist` instances should not be mutated while in use. [#2473](https://github.com/jhy/jsoup/issues/2473) ### Bug Fixes * Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459) * Fixed a `NodeTraversor` regression in 1.21.2 where removing or replacing the current node during `head()` could revisit the replacement node and loop indefinitely. The traversal docs now also clarify which inserted nodes are visited in the current pass. [#2472](https://github.com/jhy/jsoup/issues/2472) * Parsing during charset sniffing no longer fails if an advisory `available()` call throws `IOException`, as seen on JDK 8 `HttpURLConnection`. [#2474](https://github.com/jhy/jsoup/issues/2474) * `Cleaner` no longer makes relative URL attributes in the input document absolute when cleaning or validating a `Document`. URL normalization now applies only to the cleaned output, and `Safelist.isSafeAttribute()` is side effect free. [#2475](https://github.com/jhy/jsoup/issues/2475) * `Cleaner` no longer duplicates enforced attributes when the input `Document` preserves attribute case. A case-variant source attribute is now replaced by the enforced attribute in the cleaned output. [#2476](https://github.com/jhy/jsoup/issues/2476) * If a per-request SOCKS proxy is configured, jsoup now avoids using the JDK `HttpClient`, because the JDK would silently ignore that proxy and attempt to connect directly. Those requests now fall back to the legacy `HttpURLConnection` transport instead, which does support SOCKS. 
[#2468](https://github.com/jhy/jsoup/issues/2468) ## 1.22.1 (2026-Jan-01) ### Improvements * Added support for using the `re2j` regular expression engine for regex-based CSS selectors (e.g. `[attr~=regex]`, `:matches(regex)`), which ensures linear-time performance for regex evaluation. This allows safer handling of arbitrary user-supplied query regexes. To enable, add the `com.google.re2j` dependency to your classpath, e.g.: ```xml <dependency> <groupId>com.google.re2j</groupId> <artifactId>re2j</artifactId> <version>1.8</version> </dependency> ``` (If you already have that dependency in your classpath, but you want to keep using the Java regex engine, you can disable re2j via `System.setProperty("jsoup.useRe2j", "false")`.) You can confirm that the re2j engine has been enabled correctly by calling `org.jsoup.helper.Regex.usingRe2j()`. [#2407](https://github.com/jhy/jsoup/pull/2407) * Added an instance method `Parser#unescape(String, boolean)` that unescapes HTML entities using the parser's configuration (e.g. to support error tracking), complementing the existing static utility `Parser.unescapeEntities(String, boolean)`. [#2396](https://github.com/jhy/jsoup/pull/2396) * Added a configurable maximum parser depth (to limit the number of open elements on stack) to both HTML and XML parsers. The HTML parser now defaults to a depth of 512 to match browser behavior, and protect against unbounded stack growth, while the XML parser keeps unlimited depth by default, but can opt into a limit via `org.jsoup.parser.Parser#setMaxDepth`. [#2421](https://github.com/jhy/jsoup/issues/2421) * Build: added CI coverage for JDK 25 [#2403](https://github.com/jhy/jsoup/pull/2403) * Build: added a CI fuzzer for contextual fragment parsing (in addition to existing full body HTML and XML fuzzers). [oss-fuzz #14041](https://github.com/google/oss-fuzz/pull/14041) ### Changes * Set a removal schedule of jsoup 1.24.1 for previously deprecated APIs. 
### Bug Fixes * Previously cached child `Elements` of an `Element` were not correctly invalidated in `Node#replaceWith(Node)`, which could lead to incorrect results when subsequently calling `Element#children()`. [#2391](https://github.com/jhy/jsoup/issues/2391) * Attribute selector values are now compared literally without trimming. Previously, jsoup trimmed whitespace from selector values and from element attribute values, which could cause mismatches with browser behavior (e.g. `[attr=" foo "]`). Now matches align with the CSS specification and browser engines. [#2380](https://github.com/jhy/jsoup/issues/2380) * When using the JDK HttpClient, any system default proxy (`ProxySelector.getDefault()`) was ignored. Now, the system proxy is used if a per-request proxy is not set. [#2388](https://github.com/jhy/jsoup/issues/2388), [#2390](https://github.com/jhy/jsoup/pull/2390) * A `ValidationException` could be thrown in the adoption agency algorithm with particularly broken input. Now logged as a parse error. [#2393](https://github.com/jhy/jsoup/issues/2393) * Null characters in the HTML body were not consistently removed; and in foreign content were not correctly replaced. [#2395](https://github.com/jhy/jsoup/issues/2395) * An `IndexOutOfBoundsException` could be thrown when parsing a body fragment with crafted input. Now logged as a parse error. [#2397](https://github.com/jhy/jsoup/issues/2397), [#2406](https://github.com/jhy/jsoup/issues/2406) * When using StructuralEvaluators (e.g., a `parent child` selector) across many retained threads, their memoized results could also be retained, increasing memory use. These results are now cleared immediately after use, reducing overall memory consumption. [#2411](https://github.com/jhy/jsoup/issues/2411) * Cloning a `Parser` now preserves any custom `TagSet` applied to the parser. 
[#2422](https://github.com/jhy/jsoup/issues/2422), [#2423](https://github.com/jhy/jsoup/pull/2423) * Custom tags marked as `Tag.Void` now parse and serialize like the built-in void elements: they no longer consume following content, and the XML serializer emits the expected self-closing form. [#2425](https://github.com/jhy/jsoup/issues/2425) * The `<br>` element is once again classified as an inline tag (`Tag.isBlock() == false`), matching common developer expectations and its role as phrasing content in HTML, while pretty-printing and text extraction continue to treat it as a line break in the rendered output. [#2387](https://github.com/jhy/jsoup/issues/2387), [#2439](https://github.com/jhy/jsoup/issues/2439) * Fixed an intermittent truncation issue when fetching and parsing remote documents via `Jsoup.connect(url).get()`. On responses without a charset header, the initial charset sniff could sometimes (depending on buffering / `available()` behavior) be mistaken for end-of-stream and a partial parse reused, dropping trailing content. [#2448](https://github.com/jhy/jsoup/issues/2448) * `TagSet` copies no longer mutate their template during lazy lookups, preventing cross-thread `ConcurrentModificationException` when parsing with shared sessions. [#2453](https://github.com/jhy/jsoup/pull/2453) * Fixed parsing of `<svg>` `foreignObject` content nested within a `<p>`, which could incorrectly move the HTML subtree outside the SVG. [#2452](https://github.com/jhy/jsoup/issues/2452) ### Internal Changes * Deprecated internal helper `org.jsoup.internal.Functions` (for removal in v1.23.1). This was previously used to support older Android API levels without full `java.util.function` coverage; jsoup now requires core library desugaring so this indirection is no longer necessary. [#2412](https://github.com/jhy/jsoup/pull/2412) ## 1.21.2 (2025-Aug-25) ### Changes * Deprecated internal (yet visible) methods `Normalizer#normalize(String, bool)` and `Attribute#shouldCollapseAttribute(Document.OutputSettings)`. These will be removed in a future version. * Deprecated `Connection#sslSocketFactory(SSLSocketFactory)` in favor of the new `Connection#sslContext(SSLContext)`. Using `sslSocketFactory` will force the use of the legacy `HttpUrlConnection` implementation, which does not support HTTP/2. [#2370](https://github.com/jhy/jsoup/pull/2370) ### Improvements * When pretty-printing, if there are consecutive text nodes (via DOM manipulation), the non-significant whitespace between them will be collapsed. [#2349](https://github.com/jhy/jsoup/pull/2349). * Updated `Connection.Response#statusMessage()` to return a simple loggable string message (e.g. "OK") when using the `HttpClient` implementation, which doesn't otherwise return any server-set status message. [#2356](https://github.com/jhy/jsoup/issues/2346) * `Attributes#size()` and `Attributes#isEmpty()` now exclude any internal attributes (such as user data) from their count. This aligns with the attributes' serialized output and iterator. [#2369](https://github.com/jhy/jsoup/pull/2369) * Added `Connection#sslContext(SSLContext)` to provide a custom SSL (TLS) context to requests, supporting both the `HttpClient` and the legacy `HttpUrlConnection` implementations. 
[#2370](https://github.com/jhy/jsoup/pull/2370) * Performance optimizations for DOM manipulation methods including when repeatedly removing an element's first child (`element.child(0).remove()`), and when using `Parser#parseBodyFragment()` to parse a large number of direct children. [#2373](https://github.com/jhy/jsoup/pull/2373). ### Bug Fixes * When parsing from an InputStream and a multibyte character happened to straddle a buffer boundary, the stream would not be completely read. [#2353](https://github.com/jhy/jsoup/issues/2353). * In `NodeTraversor`, if a last child element was removed during the `head()` call, the parent would be visited twice. [#2355](https://github.com/jhy/jsoup/issues/2355). * Cloning an Element that has an Attributes object would add an empty internal user-data attribute to that clone, which would cause unexpected results for `Attributes#size()` and `Attributes#isEmpty()`. [#2356](https://github.com/jhy/jsoup/issues/2356) * In a multithreaded application where multiple threads are calling `Element#children()` on the same element concurrently, a race condition could happen when the method was generating the internal child element cache (a filtered view of its child nodes). Since concurrent reads of DOM objects should be threadsafe without external synchronization, this method has been updated to execute atomically. [#2366](https://github.com/jhy/jsoup/issues/2366) * When parsing HTML with svg:script elements in SVG elements, don't enter the Text insertion mode, but continue to parse as foreign content. Otherwise, misnested HTML could then cause an IndexOutOfBoundsException. [#2374](https://github.com/jhy/jsoup/issues/2374) * Malformed HTML could throw an IndexOutOfBoundsException during the adoption agency. [#2377](https://github.com/jhy/jsoup/pull/2377). ## 1.21.1 (2025-Jun-23) ### Changes * Removed previously deprecated methods. 
[#2317](https://github.com/jhy/jsoup/pull/2317) * Deprecated the `:matchText` pseudo-selector due to its side effects on the DOM; use the new `::textnode` selector and the `Element#selectNodes(String css, Class<T> type)` method instead. [#2343](https://github.com/jhy/jsoup/pull/2343) * Deprecated `Connection.Response#bufferUp()` in lieu of `Connection.Response#readFully()` which can throw a checked IOException. * Deprecated internal methods `Validate#ensureNotNull` (replaced by typed `Validate#expectNotNull`); protected HTML appenders from Attribute and Node. * If you happen to be using any of the deprecated methods, please take the opportunity now to migrate away from them, as they will be removed in a future release. ### Improvements * Enhanced the `Selector` to support direct matching against nodes such as comments and text nodes. For example, you can now find an element that follows a specific comment: `::comment:contains(prices) + p` will select `p` elements immediately after a `<!-- prices: -->` comment. Supported types include `::node`, `::leafnode`, `::comment`, `::text`, `::data`, and `::cdata`. Node contextual selectors like `::node:contains(text)`, `:matches(regex)`, and `:blank` are also supported. Introduced `Element#selectNodes(String css)` and `Element#selectNodes(String css, Class<T> nodeType)` for direct node selection. [#2324](https://github.com/jhy/jsoup/pull/2324) * Added `TagSet#onNewTag(Consumer<Tag> customizer)`: register a callback that’s invoked for each new or cloned Tag when it’s inserted into the set. Enables dynamic tweaks of tag options (for example, marking all custom tags as self-closing, or everything in a given namespace as preserving whitespace). * Made `TokenQueue` and `CharacterReader` autocloseable, to ensure that they will release their buffers back to the buffer pool, for later reuse. * Added `Selector#evaluatorOf(String css)`, as a clearer way to obtain an Evaluator from a CSS query. An alias of `QueryParser.parse(String css)`. 
* Custom tags (defined via the `TagSet`) in a foreign namespace (e.g. SVG) can be configured to parse as data tags. * Added `NodeVisitor#traverse(Node)` to simplify node traversal calls (vs. importing `NodeTraversor`). * Updated the default user-agent string to improve compatibility. [#2341](https://github.com/jhy/jsoup/issues/2341) * The HTML parser now allows the specific text-data type (Data, RcData) to be customized for known tags. (Previously, that was only supported on custom tags.) [#2326](https://github.com/jhy/jsoup/issues/2326). * Added `Connection#readFully()` as a replacement for `Connection#bufferUp()` with an explicit IOException. Similarly, added `Connection#readBody()` over `Connection#body()`. Deprecated `Connection#bufferUp()`. [#2327](https://github.com/jhy/jsoup/pull/2327) * When serializing HTML, the `<` and `>` characters are now escaped in attributes. This helps prevent a class of mutation XSS attacks. [#2337](https://github.com/jhy/jsoup/pull/2337) * Changed `Connection` to prefer using the JDK's HttpClient over HttpUrlConnection, if available, to enable HTTP/2 support by default. Users can disable via `-Djsoup.useHttpClient=false`. [#2340](https://github.com/jhy/jsoup/pull/2340) ### Bug Fixes * The contents of a `script` in a `svg` foreign context should be parsed as script data, not text. [#2320](https://github.com/jhy/jsoup/issues/2320) * `Tag#isFormSubmittable()` was updating the Tag's options. [#2323](https://github.com/jhy/jsoup/issues/2323) * The HTML pretty-printer would incorrectly trim whitespace when text followed an inline element in a block element. [#2325](https://github.com/jhy/jsoup/issues/2325) * Custom tags with hyphens or other non-letter characters in their names now work correctly as Data or RcData tags. Their closing tags are now tokenized properly. 
[#2332](https://github.com/jhy/jsoup/issues/2332) * When cloning an Element, the clone would retain the source's cached child Element list (if any), which could lead to incorrect results when modifying the clone's child elements. [#2334](https://github.com/jhy/jsoup/issues/2334) ## 1.20.1 (2025-Apr-29) ### Changes * To better follow the HTML5 spec and current browsers, the HTML parser no longer allows self-closing tags (`<foo />`) to close HTML elements by default. Foreign content (SVG, MathML), and content parsed with the XML parser, still supports self-closing tags. If you need specific HTML tags to support self-closing, you can register a custom tag via the `TagSet` configured in `Parser.tagSet()`, using `Tag#set(Tag.SelfClose)`. Standard void tags (such as `<img>`, `<br>`, etc.) continue to behave as usual and are not affected by this change. [#2300](https://github.com/jhy/jsoup/issues/2300). * The following internal components have been **deprecated**. If you do happen to be using any of these, please take the opportunity now to migrate away from them, as they will be removed in jsoup 1.21.1. * `ChangeNotifyingArrayList`, `Document.updateMetaCharsetElement()`, `Document.updateMetaCharsetElement(boolean)`, `HtmlTreeBuilder.isContentForTagData(String)`, `Parser.isContentForTagData(String)`, `Parser.setTreeBuilder(TreeBuilder)`, `Tag.formatAsBlock()`, `Tag.isFormListed()`, `TokenQueue.addFirst(String)`, `TokenQueue.chompTo(String)`, `TokenQueue.chompToIgnoreCase(String)`, `TokenQueue.consumeToIgnoreCase(String)`, `TokenQueue.consumeWord()`, `TokenQueue.matchesAny(String...)` ### Functional Improvements * Rebuilt the HTML pretty-printer, to simplify and consolidate the implementation, improve consistency, support custom Tags, and provide a cleaner path for ongoing improvements. The specific HTML produced by the pretty-printer may be different from previous versions. [#2286](https://github.com/jhy/jsoup/issues/2286). * Added the ability to define custom tags, and to modify properties of known tags, via the `TagSet` tag collection. Their properties can impact both the parse and how content is serialized (output as HTML or XML). [#2285](https://github.com/jhy/jsoup/issues/2285). * `Element.cssSelector()` will prefer to return shorter selectors by using ancestor IDs when available and unique. E.g. `#id > div > p` instead of `html > body > div > div > p` [#2283](https://github.com/jhy/jsoup/pull/2283). * Added `Elements.deselect(int index)`, `Elements.deselect(Object o)`, and `Elements.deselectAll()` methods to remove elements from the `Elements` list without removing them from the underlying DOM. Also added `Elements.asList()` method to get a modifiable list of elements without affecting the DOM. 
(Individual Elements remain linked to the DOM.) [#2100](https://github.com/jhy/jsoup/issues/2100). * Added support for sending a request body from an InputStream with `Connection.requestBodyStream(InputStream stream)`. [#1122](https://github.com/jhy/jsoup/issues/1122). * The XML parser now supports scoped xmlns: prefix namespace declarations, and applies the correct namespace to Tags and Attributes. Also, added `Tag#prefix()`, `Tag#localName()`, `Attribute#prefix()`, `Attribute#localName()`, and `Attribute#namespace()` to retrieve these. [#2299](https://github.com/jhy/jsoup/issues/2299). * CSS identifiers are now escaped and unescaped correctly to the CSS spec. `Element#cssSelector()` will emit appropriately escaped selectors, and the QueryParser supports those. Added `Selector.escapeCssIdentifier()` and `Selector.unescapeCssIdentifier()`. [#2297](https://github.com/jhy/jsoup/pull/2297), [#2305](https://github.com/jhy/jsoup/pull/2305) ### Structure and Performance Improvements * Refactored the CSS `QueryParser` into a clearer recursive descent parser. [#2310](https://github.com/jhy/jsoup/pull/2310). * CSS selectors with consecutive combinators (e.g. `div >> p`) will throw an explicit parse exception. [#2311](https://github.com/jhy/jsoup/pull/2311). * Performance: reduced the shallow size of an Element from 40 to 32 bytes, and the NodeList from 32 to 24. [#2307](https://github.com/jhy/jsoup/pull/2307). * Performance: reduced GC load of new StringBuilders when tokenizing input HTML. [#2304](https://github.com/jhy/jsoup/pull/2304). * Made `Parser` instances threadsafe, so that inadvertent use of the same instance across threads will not lead to errors. For actual concurrency, use `Parser#newInstance()` per thread. [#2314](https://github.com/jhy/jsoup/pull/2314). ### Bug Fixes * Element names containing characters invalid in XML are now normalized to valid XML names when serializing. [#1496](https://github.com/jhy/jsoup/issues/1496). 
* When serializing to XML, characters that are invalid in XML 1.0 should be removed (not encoded). [#1743](https://github.com/jhy/jsoup/issues/1743). * When converting a `Document` to the W3C DOM in `W3CDom`, elements with an attribute in an undeclared namespace now get a declaration of `xmlns:prefix="undefined"`. This allows subsequent serialization to XML via `W3CDom.asString()` to succeed. [#2087](https://github.com/jhy/jsoup/issues/2087). * The `StreamParser` could emit the final elements of a document twice, due to how `onNodeCompleted` was fired when closing out the stack. [#2295](https://github.com/jhy/jsoup/issues/2295). * When parsing with the XML parser and error tracking enabled, the trailing `?` in `<?xml version="1.0"?>` would incorrectly emit an error. [#2298](https://github.com/jhy/jsoup/issues/2298). * Calling `Element#cssSelector()` on an element with combining characters in the class or ID now produces the correct output. [#1984](https://github.com/jhy/jsoup/issues/1984). ## 1.19.1 (2025-Mar-04) ### Changes * Added support for **http/2** requests in `Jsoup.connect()`, when running on Java 11+, via the Java HttpClient implementation. [#2257](https://github.com/jhy/jsoup/pull/2257). * In this version of jsoup, the default is to make requests via the HttpUrlConnection implementation: use **`System.setProperty("jsoup.useHttpClient", "true");`** to enable making requests via the HttpClient instead, which will enable http/2 support, if available. This will become the default in a later version of jsoup, so now is a good time to validate it. * If you are repackaging the jsoup jar in your deployment (i.e. creating a shaded- or a fat-jar), make sure to specify that as a Multi-Release JAR. * If the `HttpClient` impl is not available in your JRE, requests will continue to be made via `HttpURLConnection` (in `http/1.1` mode). * Updated the minimum Android API Level validation from 10 to **21**. 
As with previous jsoup versions, Android developers need to enable core library desugaring. The minimum Java version remains Java 8. [#2173](https://github.com/jhy/jsoup/pull/2173) * Removed previously deprecated class: `org.jsoup.UncheckedIOException` (replace with `java.io.UncheckedIOException`); moved previously deprecated method `Element Element#forEach(Consumer)` to `void Element#forEach(Consumer())`. [#2246](https://github.com/jhy/jsoup/pull/2246) * Deprecated the methods `Document#updateMetaCharsetElement(boolean)` and `Document#updateMetaCharsetElement()`, as the setting had no effect. When `Document#charset(Charset)` is called, the document's meta charset or XML encoding instruction is always set. [#2247](https://github.com/jhy/jsoup/pull/2247) ### Improvements * When cleaning HTML with a `Safelist` that preserves relative links, the `isValid()` method will now consider these links valid. Additionally, the enforced attribute `rel=nofollow` will only be added to external links when configured in the safelist. [#2245](https://github.com/jhy/jsoup/pull/2245) * Added `Element#selectStream(String query)` and `Element#selectStream(Evaluator)` methods, that return a `Stream` of matching elements. Elements are evaluated and returned as they are found, and the stream can be terminated early. [#2092](https://github.com/jhy/jsoup/pull/2092) * `Element` objects now implement `Iterable`, enabling them to be used in enhanced for loops. * Added support for fragment parsing from a `Reader` via `Parser#parseFragmentInput(Reader, Element, String)`. [#1177](https://github.com/jhy/jsoup/issues/1177) * Reintroduced CLI executable examples, in `jsoup-examples.jar`. [#1702](https://github.com/jhy/jsoup/issues/1702) * Optimized performance of selectors like `#id .class` (and other similar descendant queries) by around 4.6x, by better balancing the Ancestor evaluator's cost function in the query planner. 
[#2254](https://github.com/jhy/jsoup/issues/2254) * Removed the legacy parsing rules for `<isindex>` tags, which would autovivify a `form` element with labels. This is no longer in the spec. * Added `Elements.selectFirst(String cssQuery)` and `Elements.expectFirst(String cssQuery)`, to select the first matching element from an `Elements` list. [#2263](https://github.com/jhy/jsoup/pull/2263/) * When parsing with the XML parser, XML Declarations and Processing Instructions are directly handled, vs bouncing through the HTML parser's bogus comment handler. Serialization for non-doctype declarations no longer ends with a spurious `!`. [#2275](https://github.com/jhy/jsoup/pull/2275) * When converting parsed HTML to XML or the W3C DOM, element names containing `<` are normalized to `_` to ensure valid XML. For example, `<foo<bar>` becomes `<foo_bar>`, as XML does not allow `<` in element names, but HTML5 does. [#2276](https://github.com/jhy/jsoup/pull/2276) * Reimplemented the HTML5 Adoption Agency Algorithm to the current spec. This handles mis-nested formatting / structural elements. [#2278](https://github.com/jhy/jsoup/pull/2278) ### Bug Fixes * If an element has a `;` in an attribute name, it could not be converted to a W3C DOM element, and so subsequent XPath queries could miss that element. Now, the attribute name is more completely normalized. [#2244](https://github.com/jhy/jsoup/issues/2244) * For backwards compatibility, reverted the internal attribute key for doctype names to "name". [#2241](https://github.com/jhy/jsoup/issues/2241) * In `Connection`, skip cookies that have no name, rather than throwing a validation exception. [#2242](https://github.com/jhy/jsoup/issues/2242) * When running on JDK 1.8, the error `java.lang.NoSuchMethodError: java.nio.ByteBuffer.flip()Ljava/nio/ByteBuffer;` could be thrown when calling `Response#body()` after parsing from a URL and the buffer size was exceeded.
[#2250](https://github.com/jhy/jsoup/pull/2250) * For backwards compatibility, allow `null` InputStream inputs to `Jsoup.parse(InputStream stream, ...)`, by returning an empty `Document`. [#2252](https://github.com/jhy/jsoup/issues/2252) * A `template` tag containing an `li` within an open `li` would be parsed incorrectly, as it was not recognized as a "special" tag (which have additional processing rules). Also, added the SVG and MathML namespace tags to the list of special tags. [#2258](https://github.com/jhy/jsoup/issues/2258) * A `template` tag containing a `button` within an open `button` would be parsed incorrectly, as the "in button scope" check was not aware of the `template` element. Corrected other instances including MathML and SVG elements, also. [#2271](https://github.com/jhy/jsoup/issues/2271) * An `:nth-child` selector with a negative digit-less step, such as `:nth-child(-n+2)`, would be parsed incorrectly as a positive step, and so would not match as expected. [#1147](https://github.com/jhy/jsoup/issues/1147) * Calling `doc.charset(charset)` on an empty XML document would throw an `IndexOutOfBoundsException`. [#2266](https://github.com/jhy/jsoup/issues/2266) * Fixed a memory leak when reusing a nested `StructuralEvaluator` (e.g., a selector ancestor chain like `A B C`) by ensuring cache reset calls cascade to inner members. [#2277](https://github.com/jhy/jsoup/issues/2277) * Concurrent calls to `doc.clone().append(html)` were not supported. When a document was cloned, its `Parser` was not cloned but was a shallow copy of the original parser. [#2281](https://github.com/jhy/jsoup/issues/2281) ## 1.18.3 (2024-Dec-02) ### Bug Fixes * When serializing to XML, attribute names containing `-`, `.`, or digits were incorrectly marked as invalid and removed. 
[2235](https://github.com/jhy/jsoup/issues/2235) ## 1.18.2 (2024-Nov-27) ### Improvements * Optimized the throughput and memory use throughout the input read and parse flows, with heap allocations and GC down between -6% and -89%, and throughput improved up to +143% for small inputs. Most input sizes will see throughput increases of ~20%. These performance improvements come through recycling the backing `byte[]` and `char[]` arrays used to read and parse the input. [2186](https://github.com/jhy/jsoup/pull/2186) * Speed optimized `html()` and `Entities.escape()` when the input contains UTF characters in a supplementary plane, by around 49%. [2183](https://github.com/jhy/jsoup/pull/2183) * The form associated elements returned by `FormElement.elements()` now reflect changes made to the DOM, subsequent to the original parse. [2140](https://github.com/jhy/jsoup/issues/2140) * In the `TreeBuilder`, the `onNodeInserted()` and `onNodeClosed()` events are now also fired for the outermost / root `Document` node. This enables source position tracking on the Document node (which was previously unset). And it also enables the node traversor to see the outer Document node. [2182](https://github.com/jhy/jsoup/pull/2182) * Selected Elements can now be position swapped inline using `Elements#set()`. [2212](https://github.com/jhy/jsoup/issues/2212) ### Bug Fixes * `Element.cssSelector()` would fail if the element's class contained a `*` character. [2169](https://github.com/jhy/jsoup/issues/2169) * When tracking source ranges, a text node following an invalid self-closing element may be left untracked. [2175](https://github.com/jhy/jsoup/issues/2175) * When a document has no doctype, or a doctype not named `html`, it should be parsed in Quirks Mode.
[2197](https://github.com/jhy/jsoup/issues/2197) * With a selector like `div:has(span + a)`, the `has()` component was not working correctly, as the inner combining query caused the evaluator to match those against the outer's siblings, not children. [2187](https://github.com/jhy/jsoup/issues/2187) * A selector query that included multiple `:has()` components in a nested `:has()` might incorrectly execute. [2131](https://github.com/jhy/jsoup/issues/2131) * When cookie names in a response are duplicated, the simple view of cookies available via `Connection.Response#cookies()` will provide the last one set. Generally it is better to use the [Jsoup.newSession](https://jsoup.org/cookbook/web/request-session) method to maintain a cookie jar, as that applies appropriate path selection on cookies when making requests. [1831](https://github.com/jhy/jsoup/issues/1831) * When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an attribute). [2207](https://github.com/jhy/jsoup/issues/2207) * Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204) * Follow the current HTML specification in the tokenizer to allow `<` as part of a tag name, instead of emitting it as a character node. [2230](https://github.com/jhy/jsoup/issues/2230) * Similarly, allow a `<` as the start of an attribute name, vs creating a new element. The previous behavior was intended to parse closer to what we anticipated the author's intent to be, but that does not align to the spec or to how browsers behave. [1483](https://github.com/jhy/jsoup/issues/1483) ## 1.18.1 (2024-Jul-10) ### Improvements * **Stream Parser**: A `StreamParser` provides a progressive parse of its input. As each `Element` is completed, it is emitted via a `Stream` or `Iterator` interface. 
Elements returned will be complete with all their children, and an (empty) next sibling, if applicable. Elements (or their children) may be removed from the DOM during the parse, e.g. to conserve memory, providing a mechanism to parse an input document that would otherwise be too large to fit into memory, yet still providing a DOM interface to the document and its elements. Additionally, the parser provides a `selectFirst(String query)` / `selectNext(String query)`, which will run the parser until a hit is found, at which point the parse is suspended. It can be resumed via another `select()` call, or via the `stream()` or `iterator()` methods. [2096](https://github.com/jhy/jsoup/pull/2096) * **Download Progress**: added a Response Progress event interface, which reports progress as URLs are downloaded (and parsed). Supported on both a session and a single connection level. [2164](https://github.com/jhy/jsoup/pull/2164), [656](https://github.com/jhy/jsoup/issues/656) * Added `Path` accepting parse methods: `Jsoup.parse(Path)`, `Jsoup.parse(path, charsetName, baseUri, parser)`, etc. [2055](https://github.com/jhy/jsoup/pull/2055) * Updated the `button` tag configuration to include a space between multiple button elements in the `Element.text()` method. [2105](https://github.com/jhy/jsoup/issues/2105) * Added support for the `ns|*` all elements in namespace Selector. [1811](https://github.com/jhy/jsoup/issues/1811) * When normalising attribute names during serialization, invalid characters are now replaced with `_`, vs being stripped. This should make the process clearer, and generally prevent an invalid attribute name being coerced unexpectedly. [2143](https://github.com/jhy/jsoup/issues/2143) ### Changes * Removed previously deprecated internal classes and methods. [2094](https://github.com/jhy/jsoup/pull/2094) * Build change: the built jar's OSGi manifest no longer imports itself.
[2158](https://github.com/jhy/jsoup/issues/2158) ### Bug Fixes * When tracking source positions, if the first node was a TextNode, its position was incorrectly set to `-1`. [2106](https://github.com/jhy/jsoup/issues/2106) * When connecting (or redirecting) to URLs with characters such as `{`, `}` in the path, a Malformed URL exception would be thrown (if in development), or the URL might otherwise not be escaped correctly (if in production). The URL encoding process has been improved to handle these characters correctly. [2142](https://github.com/jhy/jsoup/issues/2142) * When using `W3CDom` with a custom output Document, a Null Pointer Exception would be thrown. [2114](https://github.com/jhy/jsoup/pull/2114) * The `:has()` selector did not match correctly when using sibling combinators (e.g. `h1:has(+h2)`). [2137](https://github.com/jhy/jsoup/issues/2137) * The `:empty` selector incorrectly matched elements that started with a blank text node and were followed by non-empty nodes, due to an incorrect short-circuit. [2130](https://github.com/jhy/jsoup/issues/2130) * `Element.cssSelector()` would fail with "Did not find balanced marker" when building a selector for elements that had a `(` or `[` in their class names. And selectors with those characters escaped would not match as expected. [2146](https://github.com/jhy/jsoup/issues/2146) * Updated `Entities.escape(string)` to make the escaped text suitable for both text nodes and attributes (previously was only for text nodes). This does not impact the output of `Element.html()` which correctly applies a minimal escape depending on if the use will be for text data or in a quoted attribute. [1278](https://github.com/jhy/jsoup/issues/1278) * Fuzz: a Stack Overflow exception could occur when resolving a crafted `<base href>` URL, in the normalizing regex.
[2165](https://github.com/jhy/jsoup/issues/2165) --- ## 1.17.2 (2023-Dec-29) ### Improvements * **Attribute object accessors**: Added `Element.attribute(String)` and `Attributes.attribute(String)` to more simply obtain an `Attribute` object. [2069](https://github.com/jhy/jsoup/issues/2069) * **Attribute source tracking**: If source tracking is on, and an Attribute's key is changed (via `Attribute.setKey(String)`), the source range is now still tracked in `Attribute.sourceRange()`. [2070](https://github.com/jhy/jsoup/issues/2070) * **Wildcard attribute selector**: Added support for the `[*]` element with any attribute selector. And also restored support for selecting by an empty attribute name prefix (`[^]`). [2079](https://github.com/jhy/jsoup/issues/2079) ### Bug Fixes * **Mixed-cased source position**: When tracking the source position of attributes, if the source attribute name was mix-cased but the parser was lower-case normalizing attribute names, the source position for that attribute was not tracked correctly. [2067](https://github.com/jhy/jsoup/issues/2067) * **Source position NPE**: When tracking the source position of a body fragment parse, a null pointer exception was thrown. [2068](https://github.com/jhy/jsoup/issues/2068) * **Multi-point emoji entity**: A multi-point encoded emoji entity may be incorrectly decoded to the replacement character. [2074](https://github.com/jhy/jsoup/issues/2074) * **Selector sub-expressions**: (Regression) in a selector like `parent [attr=va], other`, the `, OR` was binding to `[attr=va]` instead of `parent [attr=va]`, causing incorrect selections. The fix includes an EvaluatorDebug class that generates a sexpr to represent the query, allowing simpler and more thorough query parse tests.
[2073](https://github.com/jhy/jsoup/issues/2073) * **XML CData output**: When generating XML-syntax output from parsed HTML, script nodes containing (pseudo) CData sections would have an extraneous CData section added, causing script execution errors. Now, the data content is emitted in a HTML/XML/XHTML polyglot format, if the data is not already within a CData section. [2078](https://github.com/jhy/jsoup/issues/2078) * **Thread safety**: The `:has` evaluator held a non-thread-safe Iterator, and so if an Evaluator object was shared across multiple concurrent threads, a NoSuchElement exception may be thrown, and the selected results may be incorrect. Now, the iterator object is a thread-local. [2088](https://github.com/jhy/jsoup/issues/2088) --- Older changes for versions 0.1.1 (2010-Jan-31) through 1.17.1 (2023-Nov-27) may be found in [change-archive.txt](./change-archive.txt). ================================================ FILE: LICENSE ================================================ The MIT License Copyright (c) 2009-2026 Jonathan Hedley Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # jsoup: Java HTML Parser **jsoup** is a Java library that makes it easy to work with real-world HTML and XML. It offers an easy-to-use API for URL fetching, data parsing, extraction, and manipulation using DOM API methods, CSS, and xpath selectors. **jsoup** implements the [WHATWG HTML5](https://html.spec.whatwg.org/multipage/) specification, and parses HTML to the same DOM as modern browsers. * scrape and [parse](https://jsoup.org/cookbook/input/parse-document-from-string) HTML from a URL, file, or string * find and [extract data](https://jsoup.org/cookbook/extracting-data/selector-syntax), using DOM traversal or CSS selectors * manipulate the [HTML elements](https://jsoup.org/cookbook/modifying-data/set-html), attributes, and text * [clean](https://jsoup.org/cookbook/cleaning-html/safelist-sanitizer) user-submitted content against a safe-list, to prevent XSS attacks * output tidy HTML jsoup is designed to deal with all varieties of HTML found in the wild; from pristine and validating, to invalid tag-soup; jsoup will create a sensible parse tree. See [**jsoup.org**](https://jsoup.org/) for downloads and the full [API documentation](https://jsoup.org/apidocs/). 
[![Build Status](https://github.com/jhy/jsoup/workflows/Build/badge.svg)](https://github.com/jhy/jsoup/actions?query=workflow%3ABuild) ## Example Fetch the [Wikipedia](https://en.wikipedia.org/wiki/Main_Page) homepage, parse it to a [DOM](https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Introduction), and select the headlines from the *In the News* section into a list of [Elements](https://jsoup.org/apidocs/org/jsoup/select/Elements.html): ```java Document doc = Jsoup.connect("https://en.wikipedia.org/").get(); log(doc.title()); Elements newsHeadlines = doc.select("#mp-itn b a"); for (Element headline : newsHeadlines) { log("%s\n\t%s", headline.attr("title"), headline.absUrl("href")); } ``` [Online sample](https://try.jsoup.org/~LGB7rk_atM2roavV0d-czMt3J_g), [full source](https://github.com/jhy/jsoup/blob/master/src/main/java/org/jsoup/examples/Wikipedia.java). ## Open source jsoup is an open source project distributed under the liberal [MIT license](https://jsoup.org/license). The source code is available on [GitHub](https://github.com/jhy/jsoup). ## Getting started 1. [Download](https://jsoup.org/download) the latest jsoup jar (or add it to your Maven/Gradle build) 2. Read the [cookbook](https://jsoup.org/cookbook/) 3. Enjoy! ### Android support When used in Android projects, [core library desugaring](https://developer.android.com/studio/write/java8-support#library-desugaring) with the [NIO specification](https://developer.android.com/studio/write/java11-nio-support-table) should be enabled to support Java 8+ features. ## Development and support If you have any questions on how to use jsoup, or have ideas for future development, please get in touch via [jsoup Discussions](https://github.com/jhy/jsoup/discussions). If you find any issues, please file a [bug](https://jsoup.org/bugs) after checking for duplicates. The [colophon](https://jsoup.org/colophon) talks about the history of and tools used to build jsoup. 
## Status jsoup is in general, stable release. ## Author jsoup was created and is maintained by [Jonathan Hedley](//jhedley.com), its primary author. jsoup is an open-source project, and many contributors have helped improve it over the years. You can see their contributions and join the development on [GitHub](https://github.com/jhy/jsoup/graphs/contributors). ## Citing jsoup If you use jsoup in research or technical documentation, you can cite it as: > **Jonathan Hedley & jsoup contributors. jsoup: Java HTML Parser (2009–present).** Available at: https://jsoup.org ```plaintext @misc{jsoup, author = {Jonathan Hedley and jsoup contributors}, title = {jsoup: Java HTML Parser}, year = {2025}, url = {https://jsoup.org} } ``` ================================================ FILE: SECURITY.md ================================================ # Security Policy ## Supported Versions Security fixes are not back-ported. Please make sure you are running at least the latest [release version](https://jsoup.org/download) of jsoup. Please remember that jsoup is an Open Source library and is provided without any warranty. Before using jsoup in a critical environment, you should satisfy yourself that it works correctly and securely for your needs. ## Reporting a Vulnerability If you believe or suspect you have identified a security vulnerability, please [report it](https://github.com/jhy/jsoup/security/advisories) via the "Report a Vulnerability" button in Security Advisories. ([Details](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing/privately-reporting-a-security-vulnerability)) We follow [Coordinated Disclosure](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing/about-coordinated-disclosure-of-security-vulnerabilities) practices and ask that you do too. Please provide as much detail as possible in your report, including the steps to reproduce the vulnerability and sample code. 
Alternatively to using GitHub, or if you have a security question, please email `security@jsoup.org`. ## Fixing Vulnerabilities We take all vulnerability reports seriously and strive to fix them as quickly as possible. Once we receive a report, we will verify the vulnerability and its impact. We will then work to develop and test a fix for the vulnerability, and release it as soon as possible. ================================================ FILE: change-archive.txt ================================================ jsoup Changelog Archive Contains change notes for versions 0.1.1 (2010-Jan-31) through 1.17.1 (2023-Nov-27). More recent changes may be found in CHANGES.md. Release 1.17.1 [27-Nov-2023] * Improvement: in Jsoup.connect(), added support for request-level authentication, supporting authentication to proxies and to servers. * Improvement: in the Elements list, added direct support for `#set(index, element)`, `#remove(index)`, `#remove(object)`, `#clear()`, `#removeAll(collection)`, `#retainAll(collection)`, `#removeIf(filter)`, `#replaceAll(operator)`. These methods update the original DOM, as well as the Elements list. * Improvement: added the NodeIterator class, to efficiently traverse a node tree using the Iterator interface. And added Stream Element#stream() and Node#nodeStream() methods, to enable fluent composable stream pipelines of node traversals. * Improvement: when changing the OutputSettings syntax to XML, the xhtml EscapeMode is automatically set by default. * Improvement: added the `:is(selector list)` pseudo-selector, which finds elements that match any of the selectors in the selector list. Useful for making large ORed selectors more readable. * Improvement: repackaged the library with native (vs automatic) JPMS module support. * Improvement: better fidelity of source positions when tracking is enabled. And implicitly created or closed elements are tracked and detectable via Range.isImplicit(). 
* Improvement: when source tracking is enabled, the source position for attribute names and values is now available. Attribute#sourceRange() provides the ranges. * Improvement: when running concurrently under Java 21+ Virtual Threads, virtual threads could be pinned to their carrier platform thread when parsing an input stream. To improve performance, particularly when parsing fetched URLs, the internal ConstrainableInputStream has been replaced by ControllableInputStream, which avoids the locking which caused that pinning. * Improvement: in Jsoup.Connect, allow any XML mimetype as a supported mimetype. Was previously limited to `{application|text}/xml`. This enables for e.g. fetching SVGs with a image/svg+xml mimetype, without having to disable mimetype validation. * Bugfix: when outputting with XML syntax, HTML elements that were parsed as data nodes ( final Document.OutputSettings.Syntax syntax; // html or xml syntax; affects processing of xml declarations vs as bogus comments final Token.StartTag startPending; final Token.EndTag endPending; Token.Tag tagPending; // tag we are building up: start or end pending final Token.Character charPending = new Token.Character(); final Token.Doctype doctypePending = new Token.Doctype(); // doctype building up final Token.Comment commentPending = new Token.Comment(); // comment building up final Token.XmlDecl xmlDeclPending; // xml decl building up @Nullable private String lastStartTag; // the last start tag emitted, to test appropriate end tag @Nullable private String lastStartCloseSeq; // " 0x10FFFF) { characterReferenceError("character [%s] outside of valid range", charval); codeRef[0] = replacementChar; } else { // fix illegal unicode characters to match browser behavior if (charval >= win1252ExtensionsStart && charval < win1252ExtensionsStart + win1252Extensions.length) { characterReferenceError("character [%s] is not a valid unicode code point", charval); charval = win1252Extensions[charval - win1252ExtensionsStart]; } 
// todo: implement number replacement table // todo: check for extra illegal unicode points as parse errors codeRef[0] = charval; } return codeRef; } else { // named // get as many letters as possible, and look for matching entities. String nameRef = reader.consumeLetterThenDigitSequence(); boolean looksLegit = reader.matches(';'); // found if a base named entity without a ;, or an extended entity with the ;. boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)); if (!found) { reader.rewindToMark(); if (looksLegit) // named with semicolon characterReferenceError("invalid named reference [%s]", nameRef); if (inAttribute) return null; // check if there's a base prefix match; consume and use that if so String prefix = Entities.findPrefix(nameRef); if (prefix.isEmpty()) return null; reader.matchConsume(prefix); nameRef = prefix; } if (inAttribute && (reader.matchesAsciiAlpha() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { // don't want that to match reader.rewindToMark(); return null; } reader.unmark(); if (!reader.matchConsume(";")) characterReferenceError("missing semicolon on [&%s]", nameRef); // missing semi int numChars = Entities.codepointsForName(nameRef, multipointHolder); if (numChars == 1) { codeRef[0] = multipointHolder[0]; return codeRef; } else if (numChars ==2) { return multipointHolder; } else { Validate.fail("Unexpected characters returned for " + nameRef); return multipointHolder; } } } Token.Tag createTagPending(boolean start) { tagPending = start ? 
startPending.reset() : endPending.reset(); return tagPending; } Token.XmlDecl createXmlDeclPending(boolean isDeclaration) { Token.XmlDecl decl = xmlDeclPending.reset(); decl.isDeclaration = isDeclaration; tagPending = decl; return decl; } void emitTagPending() { tagPending.finaliseTag(); emit(tagPending); } void createCommentPending() { commentPending.reset(); } void emitCommentPending() { emit(commentPending); } void createBogusCommentPending() { commentPending.reset(); commentPending.bogus = true; } void createDoctypePending() { doctypePending.reset(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer.reset(); } boolean isAppropriateEndTagToken() { return lastStartTag != null && tagPending.name().equalsIgnoreCase(lastStartTag); } @Nullable String appropriateEndTagName() { return lastStartTag; // could be null } /** Returns the closer sequence {@code ')) { t.error(this); t.advanceTransition(Data); } else { t.error(this); t.createBogusCommentPending(); t.commentPending.append('/'); // push the / back on that got us here t.transition(BogusComment); } } }, TagName { // from < or ': t.emitTagPending(); t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? 
t.eofError(this); t.transition(Data); break; default: // buffer underrun t.tagPending.appendTagName(c); } } }, RcdataLessthanSign { // from < in rcdata @Override void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); } else if (r.readFully() && r.matchesAsciiAlpha() && t.appropriateEndTagName() != null && !r.containsIgnoreCase(t.appropriateEndTagSeq())) { // diverge from spec: got a start tag, but there's no appropriate end tag (), so rather than // consuming to EOF; break out here t.tagPending = t.createTagPending(false).name(t.appropriateEndTagName()); t.emitTagPending(); t.transition(TagOpen); // straight into TagOpen, as we came from < and looks like we're on a start tag } else { t.emit('<'); t.transition(Rcdata); } } }, RCDATAEndTagOpen { @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesAsciiAlpha()) { t.createTagPending(false); t.tagPending.appendTagName(r.current()); t.dataBuffer.append(r.current()); t.advanceTransition(RCDATAEndTagName); } else { t.emit("': if (t.isAppropriateEndTagToken()) { t.emitTagPending(); t.transition(Data); } else anythingElse(t, r); break; default: anythingElse(t, r); } } private void anythingElse(Tokeniser t, CharacterReader r) { t.emit("': t.emit(c); t.transition(ScriptData); break; case nullChar: t.error(this); t.emit(replacementChar); t.transition(ScriptDataEscaped); break; default: t.emit(c); t.transition(ScriptDataEscaped); } } }, ScriptDataEscapedLessthanSign { @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesAsciiAlpha()) { t.createTempBuffer(); t.dataBuffer.append(r.current()); t.emit('<'); t.emit(r.current()); t.advanceTransition(ScriptDataDoubleEscapeStart); } else if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(ScriptDataEscapedEndTagOpen); } else { t.emit('<'); t.transition(ScriptDataEscaped); } } }, ScriptDataEscapedEndTagOpen { @Override void read(Tokeniser t, CharacterReader r) { if 
(r.matchesAsciiAlpha()) { t.createTagPending(false); t.tagPending.appendTagName(r.current()); t.dataBuffer.append(r.current()); t.advanceTransition(ScriptDataEscapedEndTagName); } else { t.emit("': t.emit(c); t.transition(ScriptData); break; case nullChar: t.error(this); t.emit(replacementChar); t.transition(ScriptDataDoubleEscaped); break; case eof: t.eofError(this); t.transition(Data); break; default: t.emit(c); t.transition(ScriptDataDoubleEscaped); } } }, ScriptDataDoubleEscapedLessthanSign { @Override void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.emit('/'); t.createTempBuffer(); t.advanceTransition(ScriptDataDoubleEscapeEnd); } else { t.transition(ScriptDataDoubleEscaped); } } }, ScriptDataDoubleEscapeEnd { @Override void read(Tokeniser t, CharacterReader r) { handleDataDoubleEscapeTag(t,r, ScriptDataEscaped, ScriptDataDoubleEscaped); } }, BeforeAttributeName { // from tagname ': t.emitTagPending(); t.transition(Data); break; case nullChar: r.unconsume(); t.error(this); t.tagPending.newAttribute(); t.transition(AttributeName); break; case eof: t.eofError(this); t.transition(Data); break; case '"': case '\'': case '=': t.error(this); t.tagPending.newAttribute(); t.tagPending.appendAttributeName(c, r.pos()-1, r.pos()); t.transition(AttributeName); break; case '?': // Handle trailing ? 
in if (t.tagPending instanceof Token.XmlDecl) break; // otherwise fall through to default default: // A-Z, anything else t.tagPending.newAttribute(); r.unconsume(); t.transition(AttributeName); } } }, AttributeName { // from before attribute name @Override void read(Tokeniser t, CharacterReader r) { int pos = r.pos(); String name = r.consumeToAnySorted(attributeNameCharsSorted); // spec deviate - consume and emit nulls in one hit vs stepping t.tagPending.appendAttributeName(name, pos, r.pos()); pos = r.pos(); char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': t.transition(AfterAttributeName); break; case '/': t.transition(SelfClosingStartTag); break; case '=': t.transition(BeforeAttributeValue); break; case '>': t.emitTagPending(); t.transition(Data); break; case eof: t.eofError(this); t.transition(Data); break; case '"': case '\'': case '<': t.error(this); t.tagPending.appendAttributeName(c, pos, r.pos()); break; case '?': if (t.syntax == xml && t.tagPending instanceof Token.XmlDecl) { t.transition(AfterAttributeName); break; } // otherwise default - take it default: // buffer underrun t.tagPending.appendAttributeName(c, pos, r.pos()); } } }, AfterAttributeName { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': // ignore break; case '/': t.transition(SelfClosingStartTag); break; case '=': t.transition(BeforeAttributeValue); break; case '>': t.emitTagPending(); t.transition(Data); break; case nullChar: t.error(this); t.tagPending.appendAttributeName(replacementChar, r.pos()-1, r.pos()); t.transition(AttributeName); break; case eof: t.eofError(this); t.transition(Data); break; case '"': case '\'': case '<': t.error(this); t.tagPending.newAttribute(); t.tagPending.appendAttributeName(c, r.pos()-1, r.pos()); t.transition(AttributeName); break; default: // A-Z, anything else t.tagPending.newAttribute(); r.unconsume(); 
t.transition(AttributeName); } } }, BeforeAttributeValue { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': // ignore break; case '"': t.transition(AttributeValue_doubleQuoted); break; case '&': r.unconsume(); t.transition(AttributeValue_unquoted); break; case '\'': t.transition(AttributeValue_singleQuoted); break; case nullChar: t.error(this); t.tagPending.appendAttributeValue(replacementChar, r.pos()-1, r.pos()); t.transition(AttributeValue_unquoted); break; case eof: t.eofError(this); t.emitTagPending(); t.transition(Data); break; case '>': t.error(this); t.emitTagPending(); t.transition(Data); break; case '<': case '=': case '`': t.error(this); t.tagPending.appendAttributeValue(c, r.pos()-1, r.pos()); t.transition(AttributeValue_unquoted); break; default: r.unconsume(); t.transition(AttributeValue_unquoted); } } }, AttributeValue_doubleQuoted { @Override void read(Tokeniser t, CharacterReader r) { int pos = r.pos(); String value = r.consumeAttributeQuoted(false); if (value.length() > 0) t.tagPending.appendAttributeValue(value, pos, r.pos()); else t.tagPending.setEmptyAttributeValue(); pos = r.pos(); char c = r.consume(); switch (c) { case '"': t.transition(AfterAttributeValue_quoted); break; case '&': int[] ref = t.consumeCharacterReference('"', true); if (ref != null) t.tagPending.appendAttributeValue(ref, pos, r.pos()); else t.tagPending.appendAttributeValue('&', pos, r.pos()); break; case nullChar: t.error(this); t.tagPending.appendAttributeValue(replacementChar, pos, r.pos()); break; case eof: t.eofError(this); t.transition(Data); break; default: // hit end of buffer in first read, still in attribute t.tagPending.appendAttributeValue(c, pos, r.pos()); } } }, AttributeValue_singleQuoted { @Override void read(Tokeniser t, CharacterReader r) { int pos = r.pos(); String value = r.consumeAttributeQuoted(true); if (value.length() > 0) 
t.tagPending.appendAttributeValue(value, pos, r.pos()); else t.tagPending.setEmptyAttributeValue(); pos = r.pos(); char c = r.consume(); switch (c) { case '\'': t.transition(AfterAttributeValue_quoted); break; case '&': int[] ref = t.consumeCharacterReference('\'', true); if (ref != null) t.tagPending.appendAttributeValue(ref, pos, r.pos()); else t.tagPending.appendAttributeValue('&', pos, r.pos()); break; case nullChar: t.error(this); t.tagPending.appendAttributeValue(replacementChar, pos, r.pos()); break; case eof: t.eofError(this); t.transition(Data); break; default: // hit end of buffer in first read, still in attribute t.tagPending.appendAttributeValue(c, pos, r.pos()); } } }, AttributeValue_unquoted { @Override void read(Tokeniser t, CharacterReader r) { int pos = r.pos(); String value = r.consumeToAnySorted(attributeValueUnquoted); if (value.length() > 0) t.tagPending.appendAttributeValue(value, pos, r.pos()); pos = r.pos(); char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': t.transition(BeforeAttributeName); break; case '&': int[] ref = t.consumeCharacterReference('>', true); if (ref != null) t.tagPending.appendAttributeValue(ref, pos, r.pos()); else t.tagPending.appendAttributeValue('&', pos, r.pos()); break; case '>': t.emitTagPending(); t.transition(Data); break; case nullChar: t.error(this); t.tagPending.appendAttributeValue(replacementChar, pos, r.pos()); break; case eof: t.eofError(this); t.transition(Data); break; case '"': case '\'': case '<': case '=': case '`': t.error(this); t.tagPending.appendAttributeValue(c, pos, r.pos()); break; default: // hit end of buffer in first read, still in attribute t.tagPending.appendAttributeValue(c, pos, r.pos()); } } }, // CharacterReferenceInAttributeValue state handled inline AfterAttributeValue_quoted { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': 
t.transition(BeforeAttributeName); break; case '/': t.transition(SelfClosingStartTag); break; case '>': t.emitTagPending(); t.transition(Data); break; case eof: t.eofError(this); t.transition(Data); break; case '?': // Handle trailing ? in if (t.tagPending instanceof Token.XmlDecl) break; // otherwise fall through to default default: r.unconsume(); t.error(this); t.transition(BeforeAttributeName); } } }, SelfClosingStartTag { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '>': t.tagPending.selfClosing = true; t.emitTagPending(); t.transition(Data); break; case eof: t.eofError(this); t.transition(Data); break; default: r.unconsume(); t.error(this); t.transition(BeforeAttributeName); } } }, BogusComment { @Override void read(Tokeniser t, CharacterReader r) { // todo: handle bogus comment starting from eof. when does that trigger? t.commentPending.append(r.consumeTo('>')); // todo: replace nullChar with replaceChar char next = r.current(); if (next == '>' || next == eof) { r.consume(); t.emitCommentPending(); t.transition(Data); } } }, MarkupDeclarationOpen { // from ': t.error(this); t.emitCommentPending(); t.transition(Data); break; case eof: t.eofError(this); t.emitCommentPending(); t.transition(Data); break; default: r.unconsume(); t.transition(Comment); } } }, CommentStartDash { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '-': t.transition(CommentEnd); break; case nullChar: t.error(this); t.commentPending.append(replacementChar); t.transition(Comment); break; case '>': t.error(this); t.emitCommentPending(); t.transition(Data); break; case eof: t.eofError(this); t.emitCommentPending(); t.transition(Data); break; default: t.commentPending.append(c); t.transition(Comment); } } }, Comment { @Override void read(Tokeniser t, CharacterReader r) { char c = r.current(); switch (c) { case '-': t.advanceTransition(CommentEndDash); break; case nullChar: t.error(this); 
r.advance(); t.commentPending.append(replacementChar); break; case eof: t.eofError(this); t.emitCommentPending(); t.transition(Data); break; default: t.commentPending.append(r.consumeToAny('-', nullChar)); } } }, CommentEndDash { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '-': t.transition(CommentEnd); break; case nullChar: t.error(this); t.commentPending.append('-').append(replacementChar); t.transition(Comment); break; case eof: t.eofError(this); t.emitCommentPending(); t.transition(Data); break; default: t.commentPending.append('-').append(c); t.transition(Comment); } } }, CommentEnd { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '>': t.emitCommentPending(); t.transition(Data); break; case nullChar: t.error(this); t.commentPending.append("--").append(replacementChar); t.transition(Comment); break; case '!': t.transition(CommentEndBang); break; case '-': t.commentPending.append('-'); break; case eof: t.eofError(this); t.emitCommentPending(); t.transition(Data); break; default: t.commentPending.append("--").append(c); t.transition(Comment); } } }, CommentEndBang { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '-': t.commentPending.append("--!"); t.transition(CommentEndDash); break; case '>': t.emitCommentPending(); t.transition(Data); break; case nullChar: t.error(this); t.commentPending.append("--!").append(replacementChar); t.transition(Comment); break; case eof: t.eofError(this); t.emitCommentPending(); t.transition(Data); break; default: t.commentPending.append("--!").append(c); t.transition(Comment); } } }, Doctype { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': t.transition(BeforeDoctypeName); break; case eof: t.eofError(this); // note: fall through to > case case '>': // catch invalid t.error(this); 
t.createDoctypePending(); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.error(this); t.transition(BeforeDoctypeName); } } }, BeforeDoctypeName { @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesAsciiAlpha()) { t.createDoctypePending(); t.transition(DoctypeName); return; } char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': break; // ignore whitespace case nullChar: t.error(this); t.createDoctypePending(); t.doctypePending.name.append(replacementChar); t.transition(DoctypeName); break; case eof: t.eofError(this); t.createDoctypePending(); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.createDoctypePending(); t.doctypePending.name.append(c); t.transition(DoctypeName); } } }, DoctypeName { @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesAsciiAlpha()) { String name = r.consumeLetterSequence(); t.doctypePending.name.append(name); return; } char c = r.consume(); switch (c) { case '>': t.emitDoctypePending(); t.transition(Data); break; case '\t': case '\n': case '\r': case '\f': case ' ': t.transition(AfterDoctypeName); break; case nullChar: t.error(this); t.doctypePending.name.append(replacementChar); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.doctypePending.name.append(c); } } }, AfterDoctypeName { @Override void read(Tokeniser t, CharacterReader r) { if (r.isEmpty()) { t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); return; } if (r.matchesAny('\t', '\n', '\r', '\f', ' ')) r.advance(); // ignore whitespace else if (r.matches('>')) { t.emitDoctypePending(); t.advanceTransition(Data); } else if (r.matchConsumeIgnoreCase(DocumentType.PUBLIC_KEY)) { t.doctypePending.pubSysKey = DocumentType.PUBLIC_KEY; t.transition(AfterDoctypePublicKeyword); } else if 
(r.matchConsumeIgnoreCase(DocumentType.SYSTEM_KEY)) { t.doctypePending.pubSysKey = DocumentType.SYSTEM_KEY; t.transition(AfterDoctypeSystemKeyword); } else { t.error(this); t.doctypePending.forceQuirks = true; t.advanceTransition(BogusDoctype); } } }, AfterDoctypePublicKeyword { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': t.transition(BeforeDoctypePublicIdentifier); break; case '"': t.error(this); // set public id to empty string t.transition(DoctypePublicIdentifier_doubleQuoted); break; case '\'': t.error(this); // set public id to empty string t.transition(DoctypePublicIdentifier_singleQuoted); break; case '>': t.error(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.error(this); t.doctypePending.forceQuirks = true; t.transition(BogusDoctype); } } }, BeforeDoctypePublicIdentifier { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': break; case '"': // set public id to empty string t.transition(DoctypePublicIdentifier_doubleQuoted); break; case '\'': // set public id to empty string t.transition(DoctypePublicIdentifier_singleQuoted); break; case '>': t.error(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.error(this); t.doctypePending.forceQuirks = true; t.transition(BogusDoctype); } } }, DoctypePublicIdentifier_doubleQuoted { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '"': t.transition(AfterDoctypePublicIdentifier); break; case nullChar: t.error(this); 
t.doctypePending.publicIdentifier.append(replacementChar); break; case '>': t.error(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.doctypePending.publicIdentifier.append(c); } } }, DoctypePublicIdentifier_singleQuoted { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\'': t.transition(AfterDoctypePublicIdentifier); break; case nullChar: t.error(this); t.doctypePending.publicIdentifier.append(replacementChar); break; case '>': t.error(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.doctypePending.publicIdentifier.append(c); } } }, AfterDoctypePublicIdentifier { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': t.transition(BetweenDoctypePublicAndSystemIdentifiers); break; case '>': t.emitDoctypePending(); t.transition(Data); break; case '"': t.error(this); // system id empty t.transition(DoctypeSystemIdentifier_doubleQuoted); break; case '\'': t.error(this); // system id empty t.transition(DoctypeSystemIdentifier_singleQuoted); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.error(this); t.doctypePending.forceQuirks = true; t.transition(BogusDoctype); } } }, BetweenDoctypePublicAndSystemIdentifiers { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': break; case '>': t.emitDoctypePending(); t.transition(Data); break; case '"': t.error(this); // system id empty t.transition(DoctypeSystemIdentifier_doubleQuoted); 
break; case '\'': t.error(this); // system id empty t.transition(DoctypeSystemIdentifier_singleQuoted); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.error(this); t.doctypePending.forceQuirks = true; t.transition(BogusDoctype); } } }, AfterDoctypeSystemKeyword { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': t.transition(BeforeDoctypeSystemIdentifier); break; case '>': t.error(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; case '"': t.error(this); // system id empty t.transition(DoctypeSystemIdentifier_doubleQuoted); break; case '\'': t.error(this); // system id empty t.transition(DoctypeSystemIdentifier_singleQuoted); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.error(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); } } }, BeforeDoctypeSystemIdentifier { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': break; case '"': // set system id to empty string t.transition(DoctypeSystemIdentifier_doubleQuoted); break; case '\'': // set public id to empty string t.transition(DoctypeSystemIdentifier_singleQuoted); break; case '>': t.error(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.error(this); t.doctypePending.forceQuirks = true; t.transition(BogusDoctype); } } }, DoctypeSystemIdentifier_doubleQuoted { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '"': t.transition(AfterDoctypeSystemIdentifier); break; case nullChar: 
t.error(this); t.doctypePending.systemIdentifier.append(replacementChar); break; case '>': t.error(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.doctypePending.systemIdentifier.append(c); } } }, DoctypeSystemIdentifier_singleQuoted { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\'': t.transition(AfterDoctypeSystemIdentifier); break; case nullChar: t.error(this); t.doctypePending.systemIdentifier.append(replacementChar); break; case '>': t.error(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.doctypePending.systemIdentifier.append(c); } } }, AfterDoctypeSystemIdentifier { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': break; case '>': t.emitDoctypePending(); t.transition(Data); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.error(this); t.transition(BogusDoctype); // NOT force quirks } } }, BogusDoctype { @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '>': t.emitDoctypePending(); t.transition(Data); break; case eof: t.emitDoctypePending(); t.transition(Data); break; default: // ignore char break; } } }, CdataSection { @Override void read(Tokeniser t, CharacterReader r) { String data = r.consumeTo("]]>"); t.dataBuffer.append(data); if (r.matchConsume("]]>") || r.isEmpty()) { t.emit(new Token.CData(t.dataBuffer.value())); t.transition(Data); }// otherwise, buffer underrun, stay in data section } }; abstract void read(Tokeniser t, CharacterReader r); 
static final char nullChar = '\u0000'; // char searches. must be sorted, used in inSorted. MUST update TokeniserStateTest if more arrays are added. static final char[] attributeNameCharsSorted = new char[]{'\t', '\n', '\f', '\r', ' ', '"', '\'', '/', '<', '=', '>', '?'}; static final char[] attributeValueUnquoted = new char[]{nullChar, '\t', '\n', '\f', '\r', ' ', '"', '&', '\'', '<', '=', '>', '`'}; private static final char replacementChar = Tokeniser.replacementChar; private static final String replacementStr = String.valueOf(Tokeniser.replacementChar); private static final char eof = CharacterReader.EOF; /** * Handles RawtextEndTagName, ScriptDataEndTagName, and ScriptDataEscapedEndTagName. Same body impl, just * different else exit transitions. */ private static void handleDataEndTag(Tokeniser t, CharacterReader r, TokeniserState elseTransition) { if (r.matchesAsciiAlpha()) { String name = r.consumeTagName(); t.tagPending.appendTagName(name); t.dataBuffer.append(name); return; } boolean needsExitTransition = false; if (t.isAppropriateEndTagToken() && !r.isEmpty()) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\r': case '\f': case ' ': t.transition(BeforeAttributeName); break; case '/': t.transition(SelfClosingStartTag); break; case '>': t.emitTagPending(); t.transition(Data); break; default: t.dataBuffer.append(c); needsExitTransition = true; } } else { needsExitTransition = true; } if (needsExitTransition) { t.emit("': if (t.dataBuffer.value().equals("script")) t.transition(primary); else t.transition(fallback); t.emit(c); break; default: r.unconsume(); t.transition(fallback); } } } ================================================ FILE: src/main/java/org/jsoup/parser/TreeBuilder.java ================================================ package org.jsoup.parser; import org.jsoup.helper.Validate; import org.jsoup.internal.SharedConstants; import org.jsoup.nodes.Attributes; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import 
org.jsoup.nodes.Node; import org.jsoup.nodes.Range; import org.jsoup.select.NodeVisitor; import org.jspecify.annotations.Nullable; import java.io.Reader; import java.util.ArrayList; import java.util.List; import static org.jsoup.parser.Parser.NamespaceHtml; /** * @author Jonathan Hedley */ abstract class TreeBuilder { protected Parser parser; CharacterReader reader; Tokeniser tokeniser; Document doc; // current doc we are building into ArrayList stack; // the stack of open elements String baseUri; // current base uri, for creating new elements Token currentToken; // currentToken is used for error and source position tracking. Null at start of fragment parse ParseSettings settings; TagSet tagSet; // the tags we're using in this parse @Nullable NodeVisitor nodeListener; // optional listener for node add / removes private Token.StartTag start; // start tag to process private final Token.EndTag end = new Token.EndTag(this); abstract ParseSettings defaultSettings(); boolean trackSourceRange; // optionally tracks the source range of nodes and attributes void initialiseParse(Reader input, String baseUri, Parser parser) { Validate.notNullParam(input, "input"); Validate.notNullParam(baseUri, "baseUri"); Validate.notNull(parser); doc = new Document(parser.defaultNamespace(), baseUri); doc.parser(parser); this.parser = parser; settings = parser.settings(); reader = new CharacterReader(input); trackSourceRange = parser.isTrackPosition(); reader.trackNewlines(parser.isTrackErrors() || trackSourceRange); // when tracking errors or source ranges, enable newline tracking for better legibility if (parser.isTrackErrors()) parser.getErrors().clear(); tokeniser = new Tokeniser(this); stack = new ArrayList<>(32); tagSet = parser.tagSet(); start = new Token.StartTag(this); currentToken = start; // init current token to the virtual start token. 
this.baseUri = baseUri; onNodeInserted(doc); } void completeParse() { // tidy up - as the Parser and Treebuilder are retained in document for settings / fragments if (reader == null) return; reader.close(); reader = null; tokeniser = null; stack = null; } Document parse(Reader input, String baseUri, Parser parser) { initialiseParse(input, baseUri, parser); runParser(); return doc; } List parseFragment(Reader inputFragment, @Nullable Element context, String baseUri, Parser parser) { initialiseParse(inputFragment, baseUri, parser); initialiseParseFragment(context); runParser(); return completeParseFragment(); } void initialiseParseFragment(@Nullable Element context) { // in Html, sets up context; no-op in XML } abstract List completeParseFragment(); /** Set the node listener, which will then get callbacks for node insert and removals. */ void nodeListener(NodeVisitor nodeListener) { this.nodeListener = nodeListener; } /** Create a new copy of this TreeBuilder @return copy, ready for a new parse */ abstract TreeBuilder newInstance(); void runParser() { do {} while (stepParser()); // run until stepParser sees EOF completeParse(); } boolean stepParser() { // if we have reached the end already, step by popping off the stack, to hit nodeRemoved callbacks: if (currentToken.type == Token.TokenType.EOF) { if (stack == null) { return false; } if (stack.isEmpty()) { onNodeClosed(doc); // the root doc is not on the stack, so let this final step close it stack = null; return true; } pop(); return true; } final Token token = tokeniser.read(); currentToken = token; process(token); token.reset(); return true; } abstract boolean process(Token token); boolean processStartTag(String name) { // these are "virtual" start tags (auto-created by the treebuilder), so not tracking the start position final Token.StartTag start = this.start; if (currentToken == start) { // don't recycle an in-use token return process(new Token.StartTag(this).name(name)); } return 
process(start.reset().name(name)); } boolean processStartTag(String name, Attributes attrs) { final Token.StartTag start = this.start; if (currentToken == start) { // don't recycle an in-use token return process(new Token.StartTag(this).nameAttr(name, attrs)); } start.reset(); start.nameAttr(name, attrs); return process(start); } boolean processEndTag(String name) { if (currentToken == end) { // don't recycle an in-use token return process(new Token.EndTag(this).name(name)); } return process(end.reset().name(name)); } /** Removes the last Element from the stack, hits onNodeClosed, and then returns it. * @return */ Element pop() { int size = stack.size(); Element removed = stack.remove(size - 1); onNodeClosed(removed); return removed; } /** Adds the specified Element to the end of the stack, and hits onNodeInserted. * @param element */ final void push(Element element) { stack.add(element); onNodeInserted(element); } /** Ensures the stack respects {@link Parser#getMaxDepth()} by closing the deepest open elements until there is room for a new insertion. */ final void enforceStackDepthLimit() { final int maxDepth = parser.getMaxDepth(); if (maxDepth == Integer.MAX_VALUE) return; while (stack.size() >= maxDepth) { Element trimmed = pop(); onStackPrunedForDepth(trimmed); } } /** Hook for the HTML Tree Builder that needs to clean up when an element is removed due to the depth limit */ void onStackPrunedForDepth(Element element) { // default no-op } /** Default maximum depth for parsers using this tree builder. */ int defaultMaxDepth() { return 512; } /** Get the current element (last on the stack). If all items have been removed, returns the document instead (which might not actually be on the stack; use stack.size() == 0 to test if required. @return the last element on the stack, if any; or the root document */ Element currentElement() { int size = stack.size(); return size > 0 ? 
stack.get(size-1) : doc; } /** Checks if the Current Element's normal name equals the supplied name, in the HTML namespace. @param normalName name to check @return true if there is a current element on the stack, and its name equals the supplied */ boolean currentElementIs(String normalName) { if (stack.size() == 0) return false; Element current = currentElement(); return current != null && current.normalName().equals(normalName) && current.tag().namespace().equals(NamespaceHtml); } /** Checks if the Current Element's normal name equals the supplied name, in the specified namespace. @param normalName name to check @param namespace the namespace @return true if there is a current element on the stack, and its name equals the supplied */ boolean currentElementIs(String normalName, String namespace) { if (stack.size() == 0) return false; Element current = currentElement(); return current != null && current.normalName().equals(normalName) && current.tag().namespace().equals(namespace); } /** * If the parser is tracking errors, add an error at the current position. * @param msg error message */ void error(String msg) { error(msg, (Object[]) null); } /** * If the parser is tracking errors, add an error at the current position. * @param msg error message template * @param args template arguments */ void error(String msg, Object... 
args) { ParseErrorList errors = parser.getErrors(); if (errors.canAddError()) errors.add(new ParseError(reader, msg, args)); } Tag tagFor(String tagName, String normalName, String namespace, ParseSettings settings) { return tagSet.valueOf(tagName, normalName, namespace, settings.preserveTagCase()); } Tag tagFor(Token.Tag token) { return tagSet.valueOf(token.name(), token.normalName, defaultNamespace(), settings.preserveTagCase()); } /** Gets the default namespace for this TreeBuilder * @return the default namespace */ String defaultNamespace() { return NamespaceHtml; } TagSet defaultTagSet() { return TagSet.Html(); } /** Called by implementing TreeBuilders when a node has been inserted. This implementation includes optionally tracking the source range of the node. @param node the node that was just inserted */ void onNodeInserted(Node node) { trackNodePosition(node, true); if (nodeListener != null) nodeListener.head(node, stack.size()); } /** Called by implementing TreeBuilders when a node is explicitly closed. This implementation includes optionally tracking the closing source range of the node. @param node the node being closed */ void onNodeClosed(Node node) { trackNodePosition(node, false); if (nodeListener != null) nodeListener.tail(node, stack.size()); } void trackNodePosition(Node node, boolean isStart) { if (!trackSourceRange) return; final Token token = currentToken; int startPos = token.startPos(); int endPos = token.endPos(); // handle implicit element open / closes. 
if (node instanceof Element) { final Element el = (Element) node; if (token.isEOF()) { if (el.endSourceRange().isTracked()) return; // /body and /html are left on stack until EOF, don't reset them startPos = endPos = reader.pos(); } else if (isStart) { // opening tag if (!token.isStartTag() || !el.normalName().equals(token.asStartTag().normalName)) { endPos = startPos; } } else { // closing tag if (!el.tag().isEmpty() && !el.tag().isSelfClosing()) { if (!token.isEndTag() || !el.normalName().equals(token.asEndTag().normalName)) { endPos = startPos; } } } } Range.Position startPosition = new Range.Position (startPos, reader.lineNumber(startPos), reader.columnNumber(startPos)); Range.Position endPosition = new Range.Position (endPos, reader.lineNumber(endPos), reader.columnNumber(endPos)); Range range = new Range(startPosition, endPosition); node.attributes().userData(isStart ? SharedConstants.RangeKey : SharedConstants.EndRangeKey, range); } } ================================================ FILE: src/main/java/org/jsoup/parser/XmlTreeBuilder.java ================================================ package org.jsoup.parser; import org.jsoup.helper.Validate; import org.jsoup.internal.SharedConstants; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; import org.jsoup.nodes.CDataNode; import org.jsoup.nodes.Comment; import org.jsoup.nodes.DataNode; import org.jsoup.nodes.Document; import org.jsoup.nodes.DocumentType; import org.jsoup.nodes.Element; import org.jsoup.nodes.Entities; import org.jsoup.nodes.LeafNode; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.nodes.XmlDeclaration; import org.jsoup.select.Elements; import org.jspecify.annotations.Nullable; import java.io.Reader; import java.io.StringReader; import java.util.ArrayDeque; import java.util.HashMap; import java.util.List; import java.util.Map; import static org.jsoup.parser.Parser.NamespaceXml; /** * Use the {@code XmlTreeBuilder} when you want to parse XML 
without any of the HTML DOM rules being applied to the
 * document.
 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p>
* * @author Jonathan Hedley */ public class XmlTreeBuilder extends TreeBuilder { static final String XmlnsKey = "xmlns"; static final String XmlnsPrefix = "xmlns:"; private final ArrayDeque> namespacesStack = new ArrayDeque<>(); // stack of namespaces, prefix => urn @Override ParseSettings defaultSettings() { return ParseSettings.preserveCase; } @Override protected void initialiseParse(Reader input, String baseUri, Parser parser) { super.initialiseParse(input, baseUri, parser); doc.outputSettings() .syntax(Document.OutputSettings.Syntax.xml) .escapeMode(Entities.EscapeMode.xhtml) .prettyPrint(false); // as XML, we don't understand what whitespace is significant or not namespacesStack.clear(); HashMap ns = new HashMap<>(); ns.put("xml", NamespaceXml); ns.put("", NamespaceXml); namespacesStack.push(ns); } @Override void initialiseParseFragment(@Nullable Element context) { super.initialiseParseFragment(context); if (context == null) return; // transition to the tag's text state if available TokeniserState textState = context.tag().textState(); if (textState != null) tokeniser.transition(textState); // reconstitute the namespace stack by traversing the element and its parents (top down) Elements chain = context.parents(); chain.add(0, context); for (int i = chain.size() - 1; i >= 0; i--) { Element el = chain.get(i); HashMap namespaces = new HashMap<>(namespacesStack.peek()); namespacesStack.push(namespaces); if (el.attributesSize() > 0) { processNamespaces(el.attributes(), namespaces); } } } Document parse(Reader input, String baseUri) { return parse(input, baseUri, new Parser(this)); } Document parse(String input, String baseUri) { return parse(new StringReader(input), baseUri, new Parser(this)); } @Override List completeParseFragment() { return doc.childNodes(); } @Override XmlTreeBuilder newInstance() { return new XmlTreeBuilder(); } @Override public String defaultNamespace() { return NamespaceXml; } @Override TagSet defaultTagSet() { return new TagSet(); // an 
empty tagset } @Override int defaultMaxDepth() { return Integer.MAX_VALUE; } @Override protected boolean process(Token token) { currentToken = token; // start tag, end tag, doctype, xmldecl, comment, character, eof switch (token.type) { case StartTag: insertElementFor(token.asStartTag()); break; case EndTag: popStackToClose(token.asEndTag()); break; case Comment: insertCommentFor(token.asComment()); break; case Character: insertCharacterFor(token.asCharacter()); break; case Doctype: insertDoctypeFor(token.asDoctype()); break; case XmlDecl: insertXmlDeclarationFor(token.asXmlDecl()); break; case EOF: // could put some normalisation here if desired break; default: Validate.fail("Unexpected token type: " + token.type); } return true; } void insertElementFor(Token.StartTag startTag) { // handle namespace for tag HashMap namespaces = new HashMap<>(namespacesStack.peek()); namespacesStack.push(namespaces); Attributes attributes = startTag.attributes; if (attributes != null) { settings.normalizeAttributes(attributes); attributes.deduplicate(settings); processNamespaces(attributes, namespaces); applyNamespacesToAttributes(attributes, namespaces); } enforceStackDepthLimit(); String tagName = startTag.tagName.value(); String ns = resolveNamespace(tagName, namespaces); Tag tag = tagFor(tagName, startTag.normalName, ns, settings); Element el = new Element(tag, null, attributes); currentElement().appendChild(el); push(el); if (startTag.isSelfClosing()) { tag.setSeenSelfClose(); pop(); // push & pop ensures onNodeInserted & onNodeClosed } else if (tag.isEmpty()) { pop(); // custom defined void tag } else { TokeniserState textState = tag.textState(); if (textState != null) tokeniser.transition(textState); } } private static void processNamespaces(Attributes attributes, HashMap namespaces) { // process attributes for namespaces (xmlns, xmlns:) for (Attribute attr : attributes) { String key = attr.getKey(); String value = attr.getValue(); if (key.equals(XmlnsKey)) { 
namespaces.put("", value); // new default for this level } else if (key.startsWith(XmlnsPrefix)) { String nsPrefix = key.substring(XmlnsPrefix.length()); namespaces.put(nsPrefix, value); } } } private static void applyNamespacesToAttributes(Attributes attributes, HashMap namespaces) { // second pass, apply namespace to attributes. Collects them first then adds (as userData is an attribute) Map attrPrefix = new HashMap<>(); for (Attribute attr: attributes) { String prefix = attr.prefix(); if (!prefix.isEmpty()) { if (prefix.equals(XmlnsKey)) continue; String ns = namespaces.get(prefix); if (ns != null) attrPrefix.put(SharedConstants.XmlnsAttr + prefix, ns); } } for (Map.Entry entry : attrPrefix.entrySet()) attributes.userData(entry.getKey(), entry.getValue()); } private static String resolveNamespace(String tagName, HashMap namespaces) { String ns = namespaces.get(""); int pos = tagName.indexOf(':'); if (pos > 0) { String prefix = tagName.substring(0, pos); if (namespaces.containsKey(prefix)) ns = namespaces.get(prefix); } return ns; } void insertLeafNode(LeafNode node) { currentElement().appendChild(node); onNodeInserted(node); } void insertCommentFor(Token.Comment commentToken) { Comment comment = new Comment(commentToken.getData()); insertLeafNode(comment); } void insertCharacterFor(Token.Character token) { final String data = token.getData(); LeafNode node; if (token.isCData()) node = new CDataNode(data); else if (currentElement().tag().is(Tag.Data)) node = new DataNode(data); else node = new TextNode(data); insertLeafNode(node); } void insertDoctypeFor(Token.Doctype token) { DocumentType doctypeNode = new DocumentType(settings.normalizeTag(token.getName()), token.getPublicIdentifier(), token.getSystemIdentifier()); doctypeNode.setPubSysKey(token.getPubSysKey()); insertLeafNode(doctypeNode); } void insertXmlDeclarationFor(Token.XmlDecl token) { XmlDeclaration decl = new XmlDeclaration(token.name(), token.isDeclaration); if (token.attributes != null) 
decl.attributes().addAll(token.attributes); insertLeafNode(decl); } @Override Element pop() { namespacesStack.pop(); return super.pop(); } /** * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not * found, skips. * * @param endTag tag to close */ protected void popStackToClose(Token.EndTag endTag) { // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks String elName = settings.normalizeTag(endTag.name()); Element firstFound = null; final int bottom = stack.size() - 1; final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; for (int pos = stack.size() -1; pos >= upper; pos--) { Element next = stack.get(pos); if (next.nodeName().equals(elName)) { firstFound = next; break; } } if (firstFound == null) return; // not found, skip for (int pos = stack.size() -1; pos >= 0; pos--) { Element next = pop(); if (next == firstFound) { break; } } } private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain } ================================================ FILE: src/main/java/org/jsoup/parser/package-info.java ================================================ /** Contains the HTML parser, tag specifications, and HTML tokeniser. 
*/ @NullMarked package org.jsoup.parser; import org.jspecify.annotations.NullMarked; ================================================ FILE: src/main/java/org/jsoup/safety/Cleaner.java ================================================ package org.jsoup.safety; import org.jsoup.helper.Validate; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; import org.jsoup.nodes.DataNode; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.parser.ParseErrorList; import org.jsoup.parser.Parser; import org.jsoup.select.NodeVisitor; import java.util.List; import static org.jsoup.internal.SharedConstants.DummyUri; /** The {@link Safelist}-based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes that you are expecting; no junk, and no cross-site scripting attacks!

The HTML cleaner parses the input as HTML and then runs it through a safelist, so the output HTML can only contain HTML that is allowed by the safelist.

It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the canned safelists only allow body-contained tags.

Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.

A Cleaner may be reused across multiple documents and shared across concurrent threads once its {@link Safelist} has been configured. The cleaner uses the supplied safelist directly, so later safelist changes affect later cleaning calls. If you need a variant of an existing configuration, use {@link Safelist#Safelist(Safelist)} to make a copy.

*/ public class Cleaner { private final Safelist safelist; /** Create a new cleaner, that sanitizes documents using the supplied safelist. @param safelist safe-list to clean with */ public Cleaner(Safelist safelist) { Validate.notNull(safelist); this.safelist = safelist; } /** Creates a new, clean document, from the original dirty document, containing only elements allowed by the safelist. The original document is not modified. Only elements from the dirty document's body are used. The OutputSettings of the original document are cloned into the clean document. @param dirtyDocument Untrusted base document to clean. @return cleaned document. */ public Document clean(Document dirtyDocument) { Validate.notNull(dirtyDocument); Document clean = Document.createShell(dirtyDocument.baseUri()); copySafeNodes(dirtyDocument.body(), clean.body()); clean.outputSettings(dirtyDocument.outputSettings().clone()); return clean; } /** Determines if the input document's body is valid, against the safelist. It is considered valid if all the tags and attributes in the input HTML are allowed by the safelist, and that there is no content in the head.

This method is intended to be used in a user interface as a validator for user input. Note that regardless of the output of this method, the input document must always be normalized using a method such as {@link #clean(Document)}, and the result of that method used to store or serialize the document before later reuse such as presentation to end users. This ensures that enforced attributes are set correctly, and that any differences between how a given browser and how jsoup parses the input HTML are normalized.

Example:

{@code
     Document inputDoc = Jsoup.parse(inputHtml);
     Cleaner cleaner = new Cleaner(Safelist.relaxed());
     boolean isValid = cleaner.isValid(inputDoc);
     Document normalizedDoc = cleaner.clean(inputDoc);
     }

@param dirtyDocument document to test @return true if no tags or attributes need to be removed; false if they do */ public boolean isValid(Document dirtyDocument) { Validate.notNull(dirtyDocument); Document clean = Document.createShell(dirtyDocument.baseUri()); int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body()); return numDiscarded == 0 && dirtyDocument.head().childNodes().isEmpty(); // because we only look at the body, but we start from a shell, make sure there's nothing in the head } /** Determines if the input document's body HTML is valid, against the safelist. It is considered valid if all the tags and attributes in the input HTML are allowed by the safelist.

This method is intended to be used in a user interface as a validator for user input. Note that regardless of the output of this method, the input document must always be normalized using a method such as {@link #clean(Document)}, and the result of that method used to store or serialize the document before later reuse such as presentation to end users. This ensures that enforced attributes are set correctly, and that any differences between how a given browser and how jsoup parses the input HTML are normalized.

Example:

{@code
     Document inputDoc = Jsoup.parse(inputHtml);
     Cleaner cleaner = new Cleaner(Safelist.relaxed());
     boolean isValid = cleaner.isValidBodyHtml(inputHtml);
     Document normalizedDoc = cleaner.clean(inputDoc);
     }

@param bodyHtml HTML fragment to test @return true if no tags or attributes need to be removed; false if they do */ public boolean isValidBodyHtml(String bodyHtml) { String baseUri = (safelist.preserveRelativeLinks()) ? DummyUri : ""; // fake base URI to allow relative URLs to remain valid Document clean = Document.createShell(baseUri); Document dirty = Document.createShell(baseUri); ParseErrorList errorList = ParseErrorList.tracking(1); List nodes = Parser.parseFragment(bodyHtml, dirty.body(), baseUri, errorList); dirty.body().insertChildren(0, nodes); int numDiscarded = copySafeNodes(dirty.body(), clean.body()); return numDiscarded == 0 && errorList.isEmpty(); } /** Iterates the input and copies trusted nodes (tags, attributes, text) into the destination. */ private final class CleaningVisitor implements NodeVisitor { private int numDiscarded = 0; private final Element root; private Element destination; // current element to append nodes to private CleaningVisitor(Element root, Element destination) { this.root = root; this.destination = destination; } @Override public void head(Node source, int depth) { if (source instanceof Element) { Element sourceEl = (Element) source; if (safelist.isSafeTag(sourceEl.normalName())) { // safe, clone and copy safe attrs ElementMeta meta = createSafeElement(sourceEl); Element destChild = meta.el; destination.appendChild(destChild); numDiscarded += meta.numAttribsDiscarded; destination = destChild; } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. 
numDiscarded++; } } else if (source instanceof TextNode) { TextNode sourceText = (TextNode) source; TextNode destText = new TextNode(sourceText.getWholeText()); destination.appendChild(destText); } else if (source instanceof DataNode && safelist.isSafeTag(source.parent().normalName())) { DataNode sourceData = (DataNode) source; DataNode destData = new DataNode(sourceData.getWholeData()); destination.appendChild(destData); } else { // else, we don't care about comments, xml proc instructions, etc numDiscarded++; } } @Override public void tail(Node source, int depth) { if (source instanceof Element && safelist.isSafeTag(source.normalName())) { destination = destination.parent(); // would have descended, so pop destination stack } } } private int copySafeNodes(Element source, Element dest) { CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest); cleaningVisitor.traverse(source); return cleaningVisitor.numDiscarded; } private ElementMeta createSafeElement(Element sourceEl) { Element dest = sourceEl.shallowClone(); // reuses tag, clones attributes and preserves any user data String sourceTag = sourceEl.tagName(); Attributes destAttrs = dest.attributes(); dest.clearAttributes(); // clear all non-internal attributes, ready for safe copy int numDiscarded = 0; Attributes sourceAttrs = sourceEl.attributes(); for (Attribute sourceAttr : sourceAttrs) { if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) { // will keep this attr String key = sourceAttr.getKey(); String value = sourceAttr.getValue(); if (safelist.shouldAbsUrl(sourceTag, key)) { // configured to make absolute urls for this key (href) value = sourceEl.absUrl(key); if (value.isEmpty()) // could not be made abs; leave as-is to allow custom unknown protocols value = sourceAttr.getValue(); } destAttrs.put(key, value); } else numDiscarded++; } Attributes enforcedAttrs = safelist.getEnforcedAttributes(sourceTag); // special case for , only apply to external links: if (sourceEl.nameIs("a") && 
enforcedAttrs.get("rel").equals("nofollow")) { String href = sourceEl.absUrl("href"); String sourceBase = sourceEl.baseUri(); if (!href.isEmpty() && !sourceBase.isEmpty() && href.startsWith(sourceBase)) { // same site, so don't set the nofollow enforcedAttrs.remove("rel"); } } // apply enforced attributes case-insensitively, so a preserved-case source attr is canonicalized to the enforced key for (Attribute enforcedAttr : enforcedAttrs) { destAttrs.removeIgnoreCase(enforcedAttr.getKey()); destAttrs.put(enforcedAttr.getKey(), enforcedAttr.getValue()); } dest.attributes().addAll(destAttrs); // re-attach, if removed in clear return new ElementMeta(dest, numDiscarded); } private static class ElementMeta { Element el; int numAttribsDiscarded; ElementMeta(Element el, int numAttribsDiscarded) { this.el = el; this.numAttribsDiscarded = numAttribsDiscarded; } } } ================================================ FILE: src/main/java/org/jsoup/safety/Safelist.java ================================================ package org.jsoup.safety; /* Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/rgrove/sanitize/, which inspired this safe-list configuration, and the initial defaults. */ import org.jsoup.helper.Validate; import org.jsoup.internal.Normalizer; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; import org.jsoup.nodes.Element; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Objects; import java.util.Set; import static org.jsoup.internal.Normalizer.lowerCase; /** Safelists define what HTML (elements and attributes) to allow through a {@link Cleaner}. Everything else is removed.

Start with one of the defaults:

  • {@link #none}
  • {@link #simpleText}
  • {@link #basic}
  • {@link #basicWithImages}
  • {@link #relaxed}

If you need to allow more through (please be careful!), tweak a base safelist with:

  • {@link #addTags(String... tagNames)}
  • {@link #addAttributes(String tagName, String... attributes)}
  • {@link #addEnforcedAttribute(String tagName, String attribute, String value)}
  • {@link #addProtocols(String tagName, String attribute, String... protocols)}

You can remove any setting from an existing safelist with:

  • {@link #removeTags(String... tagNames)}
  • {@link #removeAttributes(String tagName, String... attributes)}
  • {@link #removeEnforcedAttribute(String tagName, String attribute)}
  • {@link #removeProtocols(String tagName, String attribute, String... removeProtocols)}

The {@link Cleaner} and these safelists assume that you want to clean a body fragment of HTML (to add user supplied HTML into a templated page), and not to clean a full HTML document. If the latter is the case, you could wrap the templated document HTML around the cleaned body HTML.

Safelists are mutable. A {@link Cleaner} uses the supplied safelist directly, so later changes affect later cleaning calls. If you want to share a safelist across threads, finish configuring it first and do not mutate it while it is in use. To build a variant from an existing configuration, use {@link #Safelist(Safelist)} to make a copy.

If you are going to extend a safelist, please be very careful. Make sure you understand what attributes may lead to XSS attack vectors. URL attributes are particularly vulnerable and require careful validation. See the XSS Filter Evasion Cheat Sheet for some XSS attack examples (that jsoup will safeguard against with the default Cleaner and Safelist configuration).

*/ public class Safelist { private static final String All = ":all"; private static final TagName AllTag = TagName.valueOf(All); private final Set tagNames; // tags allowed, lower case. e.g. [p, br, span] private final Map> attributes; // tag -> attribute[]. allowed attributes [href] for a tag. private final Map> enforcedAttributes; // always set these attribute values private final Map>> protocols; // allowed URL protocols for attributes private boolean preserveRelativeLinks; // option to preserve relative links /** This safelist allows only text nodes: any HTML Element or any Node other than a TextNode will be removed.

Note that the output of {@link org.jsoup.Jsoup#clean(String, Safelist)} is still HTML even when using this Safelist, and so any HTML entities in the output will be appropriately escaped. If you want plain text, not HTML, you should use a text method such as {@link Element#text()} instead, after cleaning the document.

Example:

{@code
     String sourceBodyHtml = "<p>5 is &lt; 6.</p>";
     String html = Jsoup.clean(sourceBodyHtml, Safelist.none());

     Cleaner cleaner = new Cleaner(Safelist.none());
     String text = cleaner.clean(Jsoup.parse(sourceBodyHtml)).text();

     // html is: 5 is &lt; 6.
     // text is: 5 is < 6.
     }
@return safelist */ public static Safelist none() { return new Safelist(); } /** This safelist allows only simple text formatting: b, em, i, strong, u. All other HTML (tags and attributes) will be removed. @return safelist */ public static Safelist simpleText() { return new Safelist() .addTags("b", "em", "i", "strong", "u") ; } /**

This safelist allows a fuller range of text nodes: a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li, ol, p, pre, q, small, span, strike, strong, sub, sup, u, ul, and appropriate attributes.

Links (a elements) can point to http, https, ftp, mailto, and have an enforced rel=nofollow attribute if they link offsite (as indicated by the specified base URI).

Does not allow images.

@return safelist */ public static Safelist basic() { return new Safelist() .addTags( "a", "b", "blockquote", "br", "cite", "code", "dd", "dl", "dt", "em", "i", "li", "ol", "p", "pre", "q", "small", "span", "strike", "strong", "sub", "sup", "u", "ul") .addAttributes("a", "href") .addAttributes("blockquote", "cite") .addAttributes("q", "cite") .addProtocols("a", "href", "ftp", "http", "https", "mailto") .addProtocols("blockquote", "cite", "http", "https") .addProtocols("cite", "cite", "http", "https") .addEnforcedAttribute("a", "rel", "nofollow") // has special handling for external links, in Cleaner ; } /** This safelist allows the same text tags as {@link #basic}, and also allows img tags, with appropriate attributes, with src pointing to http or https. @return safelist */ public static Safelist basicWithImages() { return basic() .addTags("img") .addAttributes("img", "align", "alt", "height", "src", "title", "width") .addProtocols("img", "src", "http", "https") ; } /** This safelist allows a full range of text and structural body HTML: a, b, blockquote, br, caption, cite, code, col, colgroup, dd, div, dl, dt, em, h1, h2, h3, h4, h5, h6, i, img, li, ol, p, pre, q, small, span, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, u, ul

Links do not have an enforced rel=nofollow attribute, but you can add that if desired.

@return safelist */ public static Safelist relaxed() { return new Safelist() .addTags( "a", "b", "blockquote", "br", "caption", "cite", "code", "col", "colgroup", "dd", "div", "dl", "dt", "em", "h1", "h2", "h3", "h4", "h5", "h6", "i", "img", "li", "ol", "p", "pre", "q", "small", "span", "strike", "strong", "sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u", "ul") .addAttributes("a", "href", "title") .addAttributes("blockquote", "cite") .addAttributes("col", "span", "width") .addAttributes("colgroup", "span", "width") .addAttributes("img", "align", "alt", "height", "src", "title", "width") .addAttributes("ol", "start", "type") .addAttributes("q", "cite") .addAttributes("table", "summary", "width") .addAttributes("td", "abbr", "axis", "colspan", "rowspan", "width") .addAttributes( "th", "abbr", "axis", "colspan", "rowspan", "scope", "width") .addAttributes("ul", "type") .addProtocols("a", "href", "ftp", "http", "https", "mailto") .addProtocols("blockquote", "cite", "http", "https") .addProtocols("cite", "cite", "http", "https") .addProtocols("img", "src", "http", "https") .addProtocols("q", "cite", "http", "https") ; } /** Create a new, empty safelist. Generally it will be better to start with a default prepared safelist instead. @see #basic() @see #basicWithImages() @see #simpleText() @see #relaxed() */ public Safelist() { tagNames = new HashSet<>(); attributes = new HashMap<>(); enforcedAttributes = new HashMap<>(); protocols = new HashMap<>(); preserveRelativeLinks = false; } /** Deep copy an existing Safelist to a new Safelist. 
@param copy the Safelist to copy */ public Safelist(Safelist copy) { this(); tagNames.addAll(copy.tagNames); for (Map.Entry> copyTagAttributes : copy.attributes.entrySet()) { attributes.put(copyTagAttributes.getKey(), new HashSet<>(copyTagAttributes.getValue())); } for (Map.Entry> enforcedEntry : copy.enforcedAttributes.entrySet()) { enforcedAttributes.put(enforcedEntry.getKey(), new HashMap<>(enforcedEntry.getValue())); } for (Map.Entry>> protocolsEntry : copy.protocols.entrySet()) { Map> attributeProtocolsCopy = new HashMap<>(); for (Map.Entry> attributeProtocols : protocolsEntry.getValue().entrySet()) { attributeProtocolsCopy.put(attributeProtocols.getKey(), new HashSet<>(attributeProtocols.getValue())); } protocols.put(protocolsEntry.getKey(), attributeProtocolsCopy); } preserveRelativeLinks = copy.preserveRelativeLinks; } /** Add a list of allowed elements to a safelist. (If a tag is not allowed, it will be removed from the HTML.) @param tags tag names to allow @return this (for chaining) */ public Safelist addTags(String... tags) { Validate.notNull(tags); for (String tagName : tags) { Validate.notEmpty(tagName); Validate.isFalse(tagName.equalsIgnoreCase("noscript"), "noscript is unsupported in Safelists, due to incompatibilities between parsers with and without script-mode enabled"); tagNames.add(TagName.valueOf(tagName)); } return this; } /** Remove a list of allowed elements from a safelist. (If a tag is not allowed, it will be removed from the HTML.) @param tags tag names to disallow @return this (for chaining) */ public Safelist removeTags(String... tags) { Validate.notNull(tags); for(String tag: tags) { Validate.notEmpty(tag); TagName tagName = TagName.valueOf(tag); if(tagNames.remove(tagName)) { // Only look in sub-maps if tag was allowed attributes.remove(tagName); enforcedAttributes.remove(tagName); protocols.remove(tagName); } } return this; } /** Add a list of allowed attributes to a tag. 
(If an attribute is not allowed on an element, it will be removed.)

E.g.: addAttributes("a", "href", "class") allows href and class attributes on a tags.

To make an attribute valid for all tags, use the pseudo tag :all, e.g. addAttributes(":all", "class").

@param tag The tag the attributes are for. The tag will be added to the allowed tag list if necessary. @param attributes List of valid attributes for the tag @return this (for chaining) */ public Safelist addAttributes(String tag, String... attributes) { Validate.notEmpty(tag); Validate.notNull(attributes); Validate.isTrue(attributes.length > 0, "No attribute names supplied."); addTags(tag); TagName tagName = TagName.valueOf(tag); Set attributeSet = new HashSet<>(); for (String key : attributes) { Validate.notEmpty(key); attributeSet.add(AttributeKey.valueOf(key)); } Set currentSet = this.attributes.computeIfAbsent(tagName, k -> new HashSet<>()); currentSet.addAll(attributeSet); return this; } /** Remove a list of allowed attributes from a tag. (If an attribute is not allowed on an element, it will be removed.)

E.g.: removeAttributes("a", "href", "class") disallows href and class attributes on a tags.

To make an attribute invalid for all tags, use the pseudo tag :all, e.g. removeAttributes(":all", "class").

@param tag The tag the attributes are for. @param attributes List of invalid attributes for the tag @return this (for chaining) */ public Safelist removeAttributes(String tag, String... attributes) { Validate.notEmpty(tag); Validate.notNull(attributes); Validate.isTrue(attributes.length > 0, "No attribute names supplied."); TagName tagName = TagName.valueOf(tag); Set attributeSet = new HashSet<>(); for (String key : attributes) { Validate.notEmpty(key); attributeSet.add(AttributeKey.valueOf(key)); } if(tagNames.contains(tagName) && this.attributes.containsKey(tagName)) { // Only look in sub-maps if tag was allowed Set currentSet = this.attributes.get(tagName); currentSet.removeAll(attributeSet); if(currentSet.isEmpty()) // Remove tag from attribute map if no attributes are allowed for tag this.attributes.remove(tagName); } if(tag.equals(All)) { // Attribute needs to be removed from all individually set tags Iterator>> it = this.attributes.entrySet().iterator(); while (it.hasNext()) { Map.Entry> entry = it.next(); Set currentSet = entry.getValue(); currentSet.removeAll(attributeSet); if(currentSet.isEmpty()) // Remove tag from attribute map if no attributes are allowed for tag it.remove(); } } return this; } /** Add an enforced attribute to a tag. An enforced attribute will always be added to the element. If the element already has the attribute set, it will be overridden with this value.

E.g.: addEnforcedAttribute("a", "rel", "nofollow") will make all a tags output as <a href="..." rel="nofollow">

@param tag The tag the enforced attribute is for. The tag will be added to the allowed tag list if necessary. @param attribute The attribute name @param value The enforced attribute value @return this (for chaining) */ public Safelist addEnforcedAttribute(String tag, String attribute, String value) { Validate.notEmpty(tag); Validate.notEmpty(attribute); Validate.notEmpty(value); TagName tagName = TagName.valueOf(tag); tagNames.add(tagName); AttributeKey attrKey = AttributeKey.valueOf(attribute); AttributeValue attrVal = AttributeValue.valueOf(value); Map attrMap = enforcedAttributes.computeIfAbsent(tagName, k -> new HashMap<>()); attrMap.put(attrKey, attrVal); return this; } /** Remove a previously configured enforced attribute from a tag. @param tag The tag the enforced attribute is for. @param attribute The attribute name @return this (for chaining) */ public Safelist removeEnforcedAttribute(String tag, String attribute) { Validate.notEmpty(tag); Validate.notEmpty(attribute); TagName tagName = TagName.valueOf(tag); if(tagNames.contains(tagName) && enforcedAttributes.containsKey(tagName)) { AttributeKey attrKey = AttributeKey.valueOf(attribute); Map attrMap = enforcedAttributes.get(tagName); attrMap.remove(attrKey); if(attrMap.isEmpty()) // Remove tag from enforced attribute map if no enforced attributes are present enforcedAttributes.remove(tagName); } return this; } /** * Configure this Safelist to preserve relative links in an element's URL attribute, or convert them to absolute * links. By default, this is false: URLs will be made absolute (e.g. start with an allowed protocol, like * e.g. {@code http://}. * * @param preserve {@code true} to allow relative links, {@code false} (default) to deny * @return this Safelist, for chaining. * @see #addProtocols */ public Safelist preserveRelativeLinks(boolean preserve) { preserveRelativeLinks = preserve; return this; } /** * Get the current setting for preserving relative links. 
* @return {@code true} if relative links are preserved, {@code false} if they are converted to absolute. */ public boolean preserveRelativeLinks() { return preserveRelativeLinks; } /** Add allowed URL protocols for an element's URL attribute. This restricts the possible values of the attribute to URLs with the defined protocol.

E.g.: addProtocols("a", "href", "ftp", "http", "https")

To allow a link to an in-page URL anchor (i.e. <a href="#anchor">), add a #:
E.g.: addProtocols("a", "href", "#")

@param tag Tag the URL protocol is for @param attribute Attribute name @param protocols List of valid protocols @return this, for chaining */ public Safelist addProtocols(String tag, String attribute, String... protocols) { Validate.notEmpty(tag); Validate.notEmpty(attribute); Validate.notNull(protocols); TagName tagName = TagName.valueOf(tag); AttributeKey attrKey = AttributeKey.valueOf(attribute); Map> attrMap = this.protocols.computeIfAbsent(tagName, k -> new HashMap<>()); Set protSet = attrMap.computeIfAbsent(attrKey, k -> new HashSet<>()); for (String protocol : protocols) { Validate.notEmpty(protocol); Protocol prot = Protocol.valueOf(protocol); protSet.add(prot); } return this; } /** Remove allowed URL protocols for an element's URL attribute. If you remove all protocols for an attribute, that attribute will allow any protocol.

E.g.: removeProtocols("a", "href", "ftp")

@param tag Tag the URL protocol is for @param attribute Attribute name @param removeProtocols List of invalid protocols @return this, for chaining */ public Safelist removeProtocols(String tag, String attribute, String... removeProtocols) { Validate.notEmpty(tag); Validate.notEmpty(attribute); Validate.notNull(removeProtocols); TagName tagName = TagName.valueOf(tag); AttributeKey attr = AttributeKey.valueOf(attribute); // make sure that what we're removing actually exists; otherwise can open the tag to any data and that can // be surprising Validate.isTrue(protocols.containsKey(tagName), "Cannot remove a protocol that is not set."); Map> tagProtocols = protocols.get(tagName); Validate.isTrue(tagProtocols.containsKey(attr), "Cannot remove a protocol that is not set."); Set attrProtocols = tagProtocols.get(attr); for (String protocol : removeProtocols) { Validate.notEmpty(protocol); attrProtocols.remove(Protocol.valueOf(protocol)); } if (attrProtocols.isEmpty()) { // Remove protocol set if empty tagProtocols.remove(attr); if (tagProtocols.isEmpty()) // Remove entry for tag if empty protocols.remove(tagName); } return this; } /** * Test if the supplied tag is allowed by this safelist. * @param tag test tag * @return true if allowed */ public boolean isSafeTag(String tag) { return tagNames.contains(TagName.valueOf(tag)); } /** * Test if the supplied attribute is allowed by this safelist for this tag. *

This method does not modify the input element or attribute.

* @param tagName tag to consider allowing the attribute in * @param el element under test, to confirm protocol * @param attr attribute under test * @return true if allowed */ public boolean isSafeAttribute(String tagName, Element el, Attribute attr) { TagName tag = TagName.valueOf(tagName); AttributeKey key = AttributeKey.valueOf(attr.getKey()); Set okSet = attributes.get(tag); if (okSet != null && okSet.contains(key)) { if (protocols.containsKey(tag)) { Map> attrProts = protocols.get(tag); // ok if not defined protocol; otherwise test return !attrProts.containsKey(key) || isSafeProtocol(getProtocolValue(el, attr), attrProts.get(key)); } else { // attribute found, no protocols defined, so OK return true; } } Map enforcedSet = enforcedAttributes.get(tag); if (enforcedSet != null && enforcedSet.containsKey(key)) { // enforced attr key was LCed via AttributeKey.valueOf(attr.getKey()), // if the input already has that exact value, treat it as safe return enforcedSet.get(key).equals(AttributeValue.valueOf(attr.getValue())); } // no attributes defined for tag, try :all tag return !tagName.equals(All) && isSafeAttribute(All, el, attr); } private String getProtocolValue(Element el, Attribute attr) { String value = el.absUrl(attr.getKey()); if (value.isEmpty()) value = attr.getValue(); // if it could not be made abs, run as-is to allow custom unknown protocols return value; } private boolean isSafeProtocol(String value, Set protocols) { for (Protocol protocol : protocols) { String prot = protocol.toString(); if (prot.equals("#")) { // allows anchor links if (isValidAnchor(value)) { return true; } else { continue; } } prot += ":"; if (lowerCase(value).startsWith(prot)) { return true; } } return false; } /** Check if a URL attribute should be normalized to an absolute URL in the cleaned output. Uses the configured protocols for that tag+attribute pair, falling back to {@code :all} only if the tag does not define the attribute. 
*/ boolean shouldAbsUrl(String tagName, String attrKey) { if (preserveRelativeLinks) return false; return shouldAbsUrl(TagName.valueOf(tagName), AttributeKey.valueOf(attrKey)); } private boolean shouldAbsUrl(TagName tag, AttributeKey key) { Set allowedAttrs = attributes.get(tag); if (allowedAttrs != null && allowedAttrs.contains(key)) { Map> protocolsByAttr = protocols.get(tag); return protocolsByAttr != null && protocolsByAttr.containsKey(key); } Map enforcedAttrs = enforcedAttributes.get(tag); if (enforcedAttrs != null && enforcedAttrs.containsKey(key)) return false; return !tag.equals(AllTag) && shouldAbsUrl(AllTag, key); } private static boolean isValidAnchor(String value) { return value.startsWith("#") && !value.matches(".*\\s.*"); } /** Gets the Attributes that should be enforced for a given tag * @param tagName the tag * @return the attributes that will be enforced; empty if none are set for the given tag */ public Attributes getEnforcedAttributes(String tagName) { Attributes attrs = new Attributes(); TagName tag = TagName.valueOf(tagName); if (enforcedAttributes.containsKey(tag)) { Map keyVals = enforcedAttributes.get(tag); for (Map.Entry entry : keyVals.entrySet()) { attrs.put(entry.getKey().toString(), entry.getValue().toString()); } } return attrs; } // named types for config. All just hold strings, but here for my sanity. 
static class TagName extends TypedValue { TagName(String value) { super(value); } static TagName valueOf(String value) { return new TagName(Normalizer.lowerCase(value)); } } static class AttributeKey extends TypedValue { AttributeKey(String value) { super(value); } static AttributeKey valueOf(String value) { return new AttributeKey(Normalizer.lowerCase(value)); } } static class AttributeValue extends TypedValue { AttributeValue(String value) { super(value); } static AttributeValue valueOf(String value) { return new AttributeValue(value); } } static class Protocol extends TypedValue { Protocol(String value) { super(value); } static Protocol valueOf(String value) { return new Protocol(value); } } abstract static class TypedValue { private final String value; TypedValue(String value) { Validate.notNull(value); this.value = value; } @Override public int hashCode() { return value.hashCode(); } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null || getClass() != obj.getClass()) return false; TypedValue other = (TypedValue) obj; return Objects.equals(value, other.value); } @Override public String toString() { return value; } } } ================================================ FILE: src/main/java/org/jsoup/safety/package-info.java ================================================ /** Contains the jsoup HTML cleaner, and safelist definitions. 
*/ @NullMarked package org.jsoup.safety; import org.jspecify.annotations.NullMarked; ================================================ FILE: src/main/java/org/jsoup/select/Collector.java ================================================ package org.jsoup.select; import org.jsoup.helper.Validate; import org.jsoup.nodes.Element; import org.jsoup.nodes.LeafNode; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jspecify.annotations.Nullable; import java.util.stream.Collectors; import java.util.stream.Stream; import static java.util.stream.Collectors.toCollection; /** * Collects a list of elements that match the supplied criteria. * * @author Jonathan Hedley */ public class Collector { private Collector() {} /** Build a list of elements, by visiting the root and every descendant of root, and testing it against the Evaluator. @param eval Evaluator to test elements against @param root root of tree to descend @return list of matches; empty if none */ public static Elements collect(Evaluator eval, Element root) { Stream stream = eval.wantsNodes() ? streamNodes(eval, root, Element.class) : stream(eval, root); Elements els = stream.collect(toCollection(Elements::new)); eval.reset(); // drops any held memos return els; } /** Obtain a Stream of elements by visiting the root and every descendant of root and testing it against the evaluator. @param evaluator Evaluator to test elements against @param root root of tree to descend @return A {@link Stream} of matches @since 1.19.1 */ public static Stream stream(Evaluator evaluator, Element root) { evaluator.reset(); return root.stream().filter(evaluator.asPredicate(root)); } /** Obtain a Stream of nodes, of the specified type, by visiting the root and every descendant of root and testing it against the evaluator. @param evaluator Evaluator to test elements against @param root root of tree to descend @param type the type of node to collect (e.g. 
{@link Element}, {@link LeafNode}, {@link TextNode} etc) @param the type of node to collect @return A {@link Stream} of matches @since 1.21.1 */ public static Stream streamNodes(Evaluator evaluator, Element root, Class type) { evaluator.reset(); return root.nodeStream(type).filter(evaluator.asNodePredicate(root)); } /** Finds the first Element that matches the Evaluator that descends from the root, and stops the query once that first match is found. @param eval Evaluator to test elements against @param root root of tree to descend @return the first match; {@code null} if none */ public static @Nullable Element findFirst(Evaluator eval, Element root) { Element el = stream(eval, root).findFirst().orElse(null); eval.reset(); return el; } /** Finds the first Node that matches the Evaluator that descends from the root, and stops the query once that first match is found. @param eval Evaluator to test elements against @param root root of tree to descend @param type the type of node to collect (e.g. {@link Element}, {@link LeafNode}, {@link TextNode} etc) @return the first match; {@code null} if none @since 1.21.1 */ public static @Nullable T findFirstNode(Evaluator eval, Element root, Class type) { T node = streamNodes(eval, root, type).findFirst().orElse(null); eval.reset(); return node; } /** Build a list of nodes that match the supplied criteria, by visiting the root and every descendant of root, and testing it against the Evaluator. @param evaluator Evaluator to test elements against @param root root of tree to descend @param type the type of node to collect (e.g. 
{@link Element}, {@link LeafNode}, {@link TextNode} etc) @param the type of node to collect @return list of matches; empty if none */ public static Nodes collectNodes(Evaluator evaluator, Element root, Class type) { return streamNodes(evaluator, root, type).collect(toCollection(Nodes::new)); } } ================================================ FILE: src/main/java/org/jsoup/select/CombiningEvaluator.java ================================================ package org.jsoup.select; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Element; import org.jsoup.nodes.LeafNode; import org.jsoup.nodes.Node; import org.jspecify.annotations.Nullable; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Comparator; import java.util.List; /** * Base combining (and, or) evaluator. */ public abstract class CombiningEvaluator extends Evaluator { final ArrayList evaluators; // maintain original order so that #toString() is sensible final List sortedEvaluators; // cost ascending order int num = 0; int cost = 0; boolean wantsNodes; CombiningEvaluator() { super(); evaluators = new ArrayList<>(); sortedEvaluators = new ArrayList<>(); } CombiningEvaluator(Collection evaluators) { this(); this.evaluators.addAll(evaluators); updateEvaluators(); } public void add(Evaluator e) { evaluators.add(e); updateEvaluators(); } @Override protected void reset() { for (Evaluator evaluator : evaluators) { evaluator.reset(); } super.reset(); } @Override protected int cost() { return cost; } @Override boolean wantsNodes() { return wantsNodes; } void updateEvaluators() { // used so we don't need to bash on size() for every match test num = evaluators.size(); // sort the evaluators by lowest cost first, to optimize the evaluation order cost = 0; for (Evaluator evaluator : evaluators) { cost += evaluator.cost(); } sortedEvaluators.clear(); sortedEvaluators.addAll(evaluators); sortedEvaluators.sort(Comparator.comparingInt(Evaluator::cost)); // any want 
nodes? for (Evaluator evaluator : evaluators) { if (evaluator.wantsNodes()) { wantsNodes = true; break; } } } public static final class And extends CombiningEvaluator { public And(Collection evaluators) { super(evaluators); } And(Evaluator... evaluators) { this(Arrays.asList(evaluators)); } @Override public boolean matches(Element root, Element el) { for (int i = 0; i < num; i++) { Evaluator eval = sortedEvaluators.get(i); if (!eval.matches(root, el)) return false; } return true; } @Override public boolean matches(Element root, LeafNode leaf) { for (int i = 0; i < num; i++) { Evaluator eval = sortedEvaluators.get(i); if (!eval.matches(root, leaf)) return false; } return true; } @Override public String toString() { return StringUtil.join(evaluators, ""); } } public static final class Or extends CombiningEvaluator { /** * Create a new Or evaluator. The initial evaluators are ANDed together and used as the first clause of the OR. * @param evaluators initial OR clause (these are wrapped into an AND evaluator). */ public Or(Collection evaluators) { super(); if (num > 1) this.evaluators.add(new And(evaluators)); else // 0 or 1 this.evaluators.addAll(evaluators); updateEvaluators(); } Or(Evaluator... 
evaluators) { this(Arrays.asList(evaluators)); } Or() { super(); } @Override public boolean matches(Element root, Element element) { for (int i = 0; i < num; i++) { Evaluator eval = sortedEvaluators.get(i); if (eval.matches(root, element)) return true; } return false; } @Override public boolean matches(Element root, LeafNode leaf) { for (int i = 0; i < num; i++) { Evaluator eval = sortedEvaluators.get(i); if (eval.matches(root, leaf)) return true; } return false; } @Override public String toString() { return StringUtil.join(evaluators, ", "); } } } ================================================ FILE: src/main/java/org/jsoup/select/Elements.java ================================================ package org.jsoup.select; import org.jsoup.helper.Validate; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Comment; import org.jsoup.nodes.DataNode; import org.jsoup.nodes.Element; import org.jsoup.nodes.FormElement; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jspecify.annotations.Nullable; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.function.Predicate; import java.util.function.UnaryOperator; /** A list of {@link Element}s, with methods that act on every element in the list.

To get an {@code Elements} object, use the {@link Element#select(String)} method.

Methods that {@link #set(int, Element) set}, {@link #remove(int) remove}, or {@link #replaceAll(UnaryOperator) replace} Elements in the list will also act on the underlying {@link org.jsoup.nodes.Document DOM}.

@author Jonathan Hedley, jonathan@hedley.net */ public class Elements extends Nodes { public Elements() { } public Elements(int initialCapacity) { super(initialCapacity); } public Elements(Collection elements) { super(elements); } public Elements(List elements) { super(elements); } public Elements(Element... elements) { super(Arrays.asList(elements)); } /** * Creates a deep copy of these elements. * @return a deep copy */ @Override public Elements clone() { Elements clone = new Elements(size()); for (Element e : this) clone.add(e.clone()); return clone; } /** Convenience method to get the Elements as a plain ArrayList. This allows modification to the list of elements without modifying the source Document. I.e. whereas calling {@code elements.remove(0)} will remove the element from both the Elements and the DOM, {@code elements.asList().remove(0)} will remove the element from the list only.

Each Element is still the same DOM connected Element.

@return a new ArrayList containing the elements in this list @since 1.19.2 @see #Elements(List) */ @Override public ArrayList asList() { return new ArrayList<>(this); } // attribute methods /** Get an attribute value from the first matched element that has the attribute. @param attributeKey The attribute key. @return The attribute value from the first matched element that has the attribute. If no elements were matched (isEmpty() == true), or if the no elements have the attribute, returns empty string. @see #hasAttr(String) */ public String attr(String attributeKey) { for (Element element : this) { if (element.hasAttr(attributeKey)) return element.attr(attributeKey); } return ""; } /** Checks if any of the matched elements have this attribute defined. @param attributeKey attribute key @return true if any of the elements have the attribute; false if none do. */ public boolean hasAttr(String attributeKey) { for (Element element : this) { if (element.hasAttr(attributeKey)) return true; } return false; } /** * Get the attribute value for each of the matched elements. If an element does not have this attribute, no value is * included in the result set for that element. * @param attributeKey the attribute name to return values for. You can add the {@code abs:} prefix to the key to * get absolute URLs from relative URLs, e.g.: {@code doc.select("a").eachAttr("abs:href")} . * @return a list of each element's attribute value for the attribute */ public List eachAttr(String attributeKey) { List attrs = new ArrayList<>(size()); for (Element element : this) { if (element.hasAttr(attributeKey)) attrs.add(element.attr(attributeKey)); } return attrs; } /** * Set an attribute on all matched elements. 
* @param attributeKey attribute key * @param attributeValue attribute value * @return this */ public Elements attr(String attributeKey, String attributeValue) { for (Element element : this) { element.attr(attributeKey, attributeValue); } return this; } /** * Remove an attribute from every matched element. * @param attributeKey The attribute to remove. * @return this (for chaining) */ public Elements removeAttr(String attributeKey) { for (Element element : this) { element.removeAttr(attributeKey); } return this; } /** Add the class name to every matched element's {@code class} attribute. @param className class name to add @return this */ public Elements addClass(String className) { for (Element element : this) { element.addClass(className); } return this; } /** Remove the class name from every matched element's {@code class} attribute, if present. @param className class name to remove @return this */ public Elements removeClass(String className) { for (Element element : this) { element.removeClass(className); } return this; } /** Toggle the class name on every matched element's {@code class} attribute. @param className class name to add if missing, or remove if present, from every element. @return this */ public Elements toggleClass(String className) { for (Element element : this) { element.toggleClass(className); } return this; } /** Determine if any of the matched elements have this class name set in their {@code class} attribute. @param className class name to check for @return true if any do, false if none do */ public boolean hasClass(String className) { for (Element element : this) { if (element.hasClass(className)) return true; } return false; } /** * Get the form element's value of the first matched element. * @return The form element's value, or empty if not set. 
* @see Element#val() */ public String val() { if (size() > 0) //noinspection ConstantConditions return first().val(); // first() != null as size() > 0 else return ""; } /** * Set the form element's value in each of the matched elements. * @param value The value to set into each matched element * @return this (for chaining) */ public Elements val(String value) { for (Element element : this) element.val(value); return this; } /** * Get the combined text of all the matched elements. *

* Note that it is possible to get repeats if the matched elements contain both parent elements and their own * children, as the Element.text() method returns the combined text of a parent and all its children. * @return string of all text: unescaped and no HTML. * @see Element#text() * @see #eachText() */ public String text() { return stream() .map(Element::text) .collect(StringUtil.joining(" ")); } /** Test if any matched Element has any text content, that is not just whitespace. @return true if any element has non-blank text content. @see Element#hasText() */ public boolean hasText() { for (Element element: this) { if (element.hasText()) return true; } return false; } /** * Get the text content of each of the matched elements. If an element has no text, then it is not included in the * result. * @return A list of each matched element's text content. * @see Element#text() * @see Element#hasText() * @see #text() */ public List eachText() { ArrayList texts = new ArrayList<>(size()); for (Element el: this) { if (el.hasText()) texts.add(el.text()); } return texts; } /** * Get the combined inner HTML of all matched elements. * @return string of all element's inner HTML. * @see #text() * @see #outerHtml() */ public String html() { return stream() .map(Element::html) .collect(StringUtil.joining("\n")); } /** * Update (rename) the tag name of each matched element. For example, to change each {@code } to a {@code }, do * {@code doc.select("i").tagName("em");} * * @param tagName the new tag name * @return this, for chaining * @see Element#tagName(String) */ public Elements tagName(String tagName) { for (Element element : this) { element.tagName(tagName); } return this; } /** * Set the inner HTML of each matched element. * @param html HTML to parse and set into each matched element. 
* @return this, for chaining * @see Element#html(String) */ public Elements html(String html) { for (Element element : this) { element.html(html); } return this; } /** * Add the supplied HTML to the start of each matched element's inner HTML. * @param html HTML to add inside each element, before the existing HTML * @return this, for chaining * @see Element#prepend(String) */ public Elements prepend(String html) { for (Element element : this) { element.prepend(html); } return this; } /** * Add the supplied HTML to the end of each matched element's inner HTML. * @param html HTML to add inside each element, after the existing HTML * @return this, for chaining * @see Element#append(String) */ public Elements append(String html) { for (Element element : this) { element.append(html); } return this; } /** Insert the supplied HTML before each matched element's outer HTML. @param html HTML to insert before each element @return this, for chaining @see Element#before(String) */ @Override public Elements before(String html) { super.before(html); return this; } /** Insert the supplied HTML after each matched element's outer HTML. @param html HTML to insert after each element @return this, for chaining @see Element#after(String) */ @Override public Elements after(String html) { super.after(html); return this; } /** Wrap the supplied HTML around each matched elements. For example, with HTML {@code

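<p><b>This</b> is <b>Jsoup</b></p>},
     <code>doc.select("b").wrap("&lt;i&gt;&lt;/i&gt;");</code>
     becomes {@code <p><i><b>This</b></i> is <i><b>jsoup</b></i></p>}
     @param html HTML to wrap around each element, e.g. {@code <div class="head"></div>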
}. Can be arbitrarily deep.
     @return this (for chaining)
     @see Element#wrap
     */
    @Override
    public Elements wrap(String html) {
        super.wrap(html); // the superclass does the wrapping; overridden to narrow the return type
        return this;
    }

    /**
     * Removes the matched elements from the DOM, and moves their children up into their parents. This has the effect of
     * dropping the elements but keeping their children.
     *

* This is useful for e.g removing unwanted formatting elements but keeping their contents. *

* * E.g. with HTML:

<p>{@code <div><font>One</font> <font><a href="/">Two</a></font></div>}</p>
     * <p>{@code doc.select("font").unwrap();}</p>
     * <p>HTML = {@code <div>One <a href="/">Two</a></div>}</p>

*
     * @return this (for chaining)
     * @see Node#unwrap
     */
    public Elements unwrap() {
        for (Element wrapper : this) {
            wrapper.unwrap();
        }
        return this;
    }

    /**
     * Empty (remove all child nodes from) each matched element. This is similar to setting the inner HTML of each
     * element to nothing.
     *

* <p>
     * E.g. HTML: {@code <div><p>Hello <b>there</b></p> <p>now</p></div>}<br>
     * <code>doc.select("p").empty();</code><br>
     * HTML = {@code <div><p></p> <p></p></div>

}
     * @return this, for chaining
     * @see Element#empty()
     * @see #remove()
     */
    public Elements empty() {
        for (Element target : this) {
            target.empty();
        }
        return this;
    }

    /**
     * Remove each matched element from the DOM. This is similar to setting the outer HTML of each element to nothing.
     *

<p>The elements will still be retained in this list, in case further processing of them is desired.</p>
     * <p>
     * E.g. HTML: {@code <div><p>Hello</p> <p>there</p> <img /></div>}<br>
     * <code>doc.select("p").remove();</code><br>
     * HTML = {@code <div> <img /></div>}
     * <p>

* Note that this method should not be used to clean user-submitted HTML; rather, use {@link org.jsoup.safety.Cleaner} to clean HTML. * @return this, for chaining * @see Element#empty() * @see #empty() * @see #clear() */ @Override public Elements remove() { super.remove(); return this; } // filters /** * Find matching elements within this element list. * @param query A {@link Selector} query * @return the filtered list of elements, or an empty list if none match. */ public Elements select(String query) { return Selector.select(query, this); } /** Find the first Element that matches the {@link Selector} CSS query within this element list.

<p>This is effectively the same as calling {@code elements.select(query).first()}, but is more efficient as query execution stops on the first hit.</p>

@param cssQuery a {@link Selector} query @return the first matching element, or {@code null} if there is no match. @see #expectFirst(String) @since 1.19.1 */ public @Nullable Element selectFirst(String cssQuery) { return Selector.selectFirst(cssQuery, this); } /** Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. @param cssQuery a {@link Selector} query @return the first matching element @throws IllegalArgumentException if no match is found @since 1.19.1 */ public Element expectFirst(String cssQuery) { return Validate.expectNotNull( Selector.selectFirst(cssQuery, this), "No elements matched the query '%s' in the elements.", cssQuery ); } /** * Remove elements from this list that match the {@link Selector} query. *

* <p>
     * E.g. HTML: {@code <div class=logo>One</div> <div>Two</div>}<br>
     * <code>Elements divs = doc.select("div").not(".logo");</code><br>
     * Result: {@code divs: [<div>Two</div>]}
     * <p>

* @param query the selector query whose results should be removed from these elements
     * @return a new elements list that contains only the filtered results
     */
    public Elements not(String query) {
        // select what matches the query, then filter those matches out of this list
        return Selector.filterOut(this, Selector.select(query, this));
    }

    /**
     * Get the nth matched element as an Elements object.
     *

* See also {@link #get(int)} to retrieve an Element. * @param index the (zero-based) index of the element in the list to retain * @return Elements containing only the specified element, or, if that element did not exist, an empty list. */ public Elements eq(int index) { return size() > index ? new Elements(get(index)) : new Elements(); } /** * Test if any of the matched elements match the supplied query. * @param query A selector * @return true if at least one element in the list matches the query. */ public boolean is(String query) { Evaluator eval = Selector.evaluatorOf(query); for (Element e : this) { if (e.is(eval)) return true; } return false; } /** * Get the immediate next element sibling of each element in this list. * @return next element siblings. */ public Elements next() { return siblings(null, true, false); } /** * Get the immediate next element sibling of each element in this list, filtered by the query. * @param query CSS query to match siblings against * @return next element siblings. */ public Elements next(String query) { return siblings(query, true, false); } /** * Get each of the following element siblings of each element in this list. * @return all following element siblings. */ public Elements nextAll() { return siblings(null, true, true); } /** * Get each of the following element siblings of each element in this list, that match the query. * @param query CSS query to match siblings against * @return all following element siblings. */ public Elements nextAll(String query) { return siblings(query, true, true); } /** * Get the immediate previous element sibling of each element in this list. * @return previous element siblings. */ public Elements prev() { return siblings(null, false, false); } /** * Get the immediate previous element sibling of each element in this list, filtered by the query. * @param query CSS query to match siblings against * @return previous element siblings. 
*/ public Elements prev(String query) { return siblings(query, false, false); } /** * Get each of the previous element siblings of each element in this list. * @return all previous element siblings. */ public Elements prevAll() { return siblings(null, false, true); } /** * Get each of the previous element siblings of each element in this list, that match the query. * @param query CSS query to match siblings against * @return all previous element siblings. */ public Elements prevAll(String query) { return siblings(query, false, true); } private Elements siblings(@Nullable String query, boolean next, boolean all) { Elements els = new Elements(); Evaluator eval = query != null? Selector.evaluatorOf(query) : null; for (Element e : this) { do { Element sib = next ? e.nextElementSibling() : e.previousElementSibling(); if (sib == null) break; if (eval == null || sib.is(eval)) els.add(sib); e = sib; } while (all); } return els; } /** * Get all of the parents and ancestor elements of the matched elements. * @return all of the parents and ancestor elements of the matched elements */ public Elements parents() { HashSet combo = new LinkedHashSet<>(); for (Element e: this) { combo.addAll(e.parents()); } return new Elements(combo); } // list-like methods /** Get the first matched element. @return The first matched element, or null if contents is empty. */ @Override public @Nullable Element first() { return super.first(); } /** Get the last matched element. @return The last matched element, or null if contents is empty. */ @Override public @Nullable Element last() { return super.last(); } /** * Perform a depth-first traversal on each of the selected elements. * @param nodeVisitor the visitor callbacks to perform on each node * @return this, for chaining */ public Elements traverse(NodeVisitor nodeVisitor) { NodeTraversor.traverse(nodeVisitor, this); return this; } /** * Perform a depth-first filtering on each of the selected elements. 
* @param nodeFilter the filter callbacks to perform on each node * @return this, for chaining */ public Elements filter(NodeFilter nodeFilter) { NodeTraversor.filter(nodeFilter, this); return this; } /** * Get the {@link FormElement} forms from the selected elements, if any. * @return a list of {@link FormElement}s pulled from the matched elements. The list will be empty if the elements contain * no forms. */ public List forms() { ArrayList forms = new ArrayList<>(); for (Element el: this) if (el instanceof FormElement) forms.add((FormElement) el); return forms; } /** * Get {@link Comment} nodes that are direct child nodes of the selected elements. * @return Comment nodes, or an empty list if none. */ public List comments() { return childNodesOfType(Comment.class); } /** * Get {@link TextNode} nodes that are direct child nodes of the selected elements. * @return TextNode nodes, or an empty list if none. */ public List textNodes() { return childNodesOfType(TextNode.class); } /** * Get {@link DataNode} nodes that are direct child nodes of the selected elements. DataNode nodes contain the * content of tags such as {@code script}, {@code style} etc and are distinct from {@link TextNode}s. * @return Comment nodes, or an empty list if none. */ public List dataNodes() { return childNodesOfType(DataNode.class); } private List childNodesOfType(Class tClass) { ArrayList nodes = new ArrayList<>(); for (Element el: this) { for (int i = 0; i < el.childNodeSize(); i++) { Node node = el.childNode(i); if (tClass.isInstance(node)) nodes.add(tClass.cast(node)); } } return nodes; } // list methods that update the DOM: /** Replace the Element at the specified index in this list, and in the DOM. 
@param index index of the element to replace @param element element to be stored at the specified position @return the old Element at this index @since 1.17.1 */ @Override public Element set(int index, Element element) { return super.set(index, element); } /** Remove the Element at the specified index in this list, and from the DOM. @param index the index of the element to be removed @return the old element at this index @see #deselect(int) @since 1.17.1 */ @Override public Element remove(int index) { return super.remove(index); } /** Remove the Element at the specified index in this list, but not from the DOM. @param index the index of the element to be removed @return the old element at this index @see #remove(int) @since 1.19.2 */ @Override public Element deselect(int index) { return super.deselect(index); } } ================================================ FILE: src/main/java/org/jsoup/select/Evaluator.java ================================================ package org.jsoup.select; import org.jsoup.helper.Validate; import org.jsoup.nodes.Comment; import org.jsoup.nodes.Document; import org.jsoup.nodes.DocumentType; import org.jsoup.nodes.Element; import org.jsoup.nodes.LeafNode; import org.jsoup.nodes.Node; import org.jsoup.nodes.PseudoTextElement; import org.jsoup.nodes.TextNode; import org.jsoup.nodes.XmlDeclaration; import org.jsoup.parser.ParseSettings; import org.jsoup.helper.Regex; import java.util.List; import java.util.function.Predicate; import java.util.regex.Pattern; import static org.jsoup.internal.Normalizer.lowerCase; import static org.jsoup.internal.Normalizer.normalize; import static org.jsoup.internal.StringUtil.normaliseWhitespace; /** An Evaluator tests if an element (or a node) meets the selector's requirements. Obtain an evaluator for a given CSS selector with {@link Selector#evaluatorOf(String css)}. 
If you are executing the same selector on many elements (or documents), it can be more efficient to compile and reuse an Evaluator than to reparse the selector on each invocation of select().

Evaluators are thread-safe and may be used concurrently across multiple documents.

*/
public abstract class Evaluator {
    protected Evaluator() {
    }

    /**
     Provides a Predicate for this Evaluator, matching the test Element.
     * @param root the root Element, for match evaluation
     * @return a predicate that accepts an Element to test for matches with this Evaluator
     * @since 1.17.1
     */
    public Predicate asPredicate(Element root) {
        return element -> matches(root, element);
    }

    /** Node-accepting counterpart of {@link #asPredicate(Element)}; delegates to {@code matches(root, node)}. */
    Predicate asNodePredicate(Element root) {
        return node -> matches(root, node);
    }

    /**
     * Test if the element meets the evaluator's requirements.
     *
     * @param root    Root of the matching subtree
     * @param element tested element
     * @return Returns <code>true</code> if the requirements are met or
     * <code>false</code> otherwise
     */
    public abstract boolean matches(Element root, Element element);

    /**
     Dispatches a node to the matching overload: Elements always, LeafNodes only when this evaluator
     {@link #wantsNodes() wants nodes}. Any other node does not match.
     */
    final boolean matches(Element root, Node node) {
        if (node instanceof Element) {
            return matches(root, (Element) node);
        }
        if (node instanceof LeafNode && wantsNodes()) {
            return matches(root, (LeafNode) node);
        }
        return false;
    }

    /** Leaf node test; this base implementation never matches. */
    boolean matches(Element root, LeafNode leafNode) {
        return false;
    }

    /** Whether leaf nodes should be offered to this evaluator; this base implementation says no. */
    boolean wantsNodes() {
        return false;
    }

    /**
     Reset any internal state in this Evaluator before executing a new Collector evaluation.
     */
    protected void reset() {
    }

    /**
     A relative evaluator cost function. During evaluation, Evaluators are sorted by ascending cost as an optimization.
* @return the relative cost of this Evaluator */ protected int cost() { return 5; // a nominal default cost } /** * Evaluator for tag name */ public static final class Tag extends Evaluator { private final String tagName; public Tag(String tagName) { this.tagName = tagName; } @Override public boolean matches(Element root, Element element) { return (element.nameIs(tagName)); } @Override protected int cost() { return 1; } @Override public String toString() { return String.format("%s", tagName); } } /** * Evaluator for tag name that starts with prefix; used for ns|* */ public static final class TagStartsWith extends Evaluator { private final String tagName; public TagStartsWith(String tagName) { this.tagName = tagName; } @Override public boolean matches(Element root, Element element) { return (element.normalName().startsWith(tagName)); } @Override public String toString() { return String.format("%s|*", tagName); } } /** * Evaluator for tag name that ends with suffix; used for *|el */ public static final class TagEndsWith extends Evaluator { private final String tagName; public TagEndsWith(String tagName) { this.tagName = tagName; } @Override public boolean matches(Element root, Element element) { return (element.normalName().endsWith(tagName)); } @Override public String toString() { return String.format("*|%s", tagName); } } /** * Evaluator for element id */ public static final class Id extends Evaluator { private final String id; public Id(String id) { this.id = id; } @Override public boolean matches(Element root, Element element) { return (id.equals(element.id())); } @Override protected int cost() { return 2; } @Override public String toString() { return String.format("#%s", id); } } /** * Evaluator for element class */ public static final class Class extends Evaluator { private final String className; public Class(String className) { this.className = className; } @Override public boolean matches(Element root, Element element) { return (element.hasClass(className)); 
}

        @Override
        protected int cost() {
            return 8; // does whitespace scanning; more than .contains()
        }

        @Override
        public String toString() {
            return String.format(".%s", className);
        }
    }

    /**
     * Evaluator for attribute name matching
     */
    public static final class Attribute extends Evaluator {
        private final String key;

        public Attribute(String key) {
            this.key = key;
        }

        @Override
        public boolean matches(Element root, Element element) {
            return element.hasAttr(key);
        }

        @Override
        protected int cost() {
            return 2;
        }

        @Override
        public String toString() {
            return String.format("[%s]", key);
        }
    }

    /**
     * Evaluator for attribute name prefix matching
     */
    public static final class AttributeStarting extends Evaluator {
        private final String keyPrefix;

        public AttributeStarting(String keyPrefix) {
            Validate.notNull(keyPrefix); // OK to be empty - will find elements with any attributes
            this.keyPrefix = lowerCase(keyPrefix);
        }

        @Override
        public boolean matches(Element root, Element element) {
            // compare lower-cased attribute keys against the (already lower-cased) prefix
            for (org.jsoup.nodes.Attribute candidate : element.attributes().asList()) {
                final String candidateKey = lowerCase(candidate.getKey());
                if (candidateKey.startsWith(keyPrefix)) {
                    return true;
                }
            }
            return false;
        }

        @Override
        protected int cost() {
            return 6;
        }

        @Override
        public String toString() {
            return String.format("[^%s]", keyPrefix);
        }
    }

    /**
     * Evaluator for attribute name/value matching
     */
    public static final class AttributeWithValue extends AttributeKeyPair {
        public AttributeWithValue(String key, String value) {
            super(key, value);
        }

        @Override
        public boolean matches(Element root, Element element) {
            if (!element.hasAttr(key)) {
                return false;
            }
            return value.equalsIgnoreCase(element.attr(key));
        }

        @Override
        protected int cost() {
            return 3;
        }

        @Override
        public String toString() {
            return String.format("[%s=%s]", key, value);
        }
    }

    /**
     * Evaluator for attribute name != value matching
     */
    public static final class AttributeWithValueNot extends AttributeKeyPair {
        public AttributeWithValueNot(String key, String value) {
            super(key, value);
        }

        @Override
        public boolean
matches(Element root, Element element) { return !value.equalsIgnoreCase(element.attr(key)); } @Override protected int cost() { return 3; } @Override public String toString() { return String.format("[%s!=%s]", key, value); } } /** * Evaluator for attribute name/value matching (value prefix) */ public static final class AttributeWithValueStarting extends AttributeKeyPair { public AttributeWithValueStarting(String key, String value) { super(key, value); } @Override public boolean matches(Element root, Element element) { return element.hasAttr(key) && lowerCase(element.attr(key)).startsWith(value); // value is lower case already } @Override protected int cost() { return 4; } @Override public String toString() { return String.format("[%s^=%s]", key, value); } } /** * Evaluator for attribute name/value matching (value ending) */ public static final class AttributeWithValueEnding extends AttributeKeyPair { public AttributeWithValueEnding(String key, String value) { super(key, value); } @Override public boolean matches(Element root, Element element) { return element.hasAttr(key) && lowerCase(element.attr(key)).endsWith(value); // value is lower case } @Override protected int cost() { return 4; } @Override public String toString() { return String.format("[%s$=%s]", key, value); } } /** * Evaluator for attribute name/value matching (value containing) */ public static final class AttributeWithValueContaining extends AttributeKeyPair { public AttributeWithValueContaining(String key, String value) { super(key, value); } @Override public boolean matches(Element root, Element element) { return element.hasAttr(key) && lowerCase(element.attr(key)).contains(value); // value is lower case } @Override protected int cost() { return 6; } @Override public String toString() { return String.format("[%s*=%s]", key, value); } } /** * Evaluator for attribute name/value matching (value regex matching) */ public static final class AttributeWithValueMatching extends Evaluator { final String key; 
final Regex pattern; public AttributeWithValueMatching(String key, Regex pattern) { this.key = normalize(key); this.pattern = pattern; } public AttributeWithValueMatching(String key, Pattern pattern) { this(key, Regex.fromPattern(pattern)); // api compat } @Override public boolean matches(Element root, Element element) { return element.hasAttr(key) && pattern.matcher(element.attr(key)).find(); } @Override protected int cost() { return 8; } @Override public String toString() { return String.format("[%s~=%s]", key, pattern.toString()); } } /** * Abstract evaluator for attribute name/value matching */ public abstract static class AttributeKeyPair extends Evaluator { final String key; final String value; public AttributeKeyPair(String key, String value) { Validate.notEmpty(key); Validate.notNull(value); this.key = normalize(key); boolean quoted = value.startsWith("'") && value.endsWith("'") || value.startsWith("\"") && value.endsWith("\""); if (quoted) { Validate.isTrue(value.length() > 1, "Quoted value must have content"); value = value.substring(1, value.length() - 1); } this.value = lowerCase(value); // case-insensitive match } /** @deprecated since 1.22.1, use {@link #AttributeKeyPair(String, String)}; the previous trimQuoted parameter is no longer used. This constructor will be removed in jsoup 1.24.1. 
*/
        @Deprecated
        public AttributeKeyPair(String key, String value, boolean ignored) {
            this(key, value);
        }
    }

    /**
     * Evaluator for any / all element matching
     */
    public static final class AllElements extends Evaluator {
        @Override
        public boolean matches(Element root, Element element) {
            return true;
        }

        @Override
        protected int cost() {
            return 10;
        }

        @Override
        public String toString() {
            return "*";
        }
    }

    /**
     * Evaluator for matching by sibling index number (e {@literal <} idx)
     */
    public static final class IndexLessThan extends IndexEvaluator {
        public IndexLessThan(int index) {
            super(index);
        }

        @Override
        public boolean matches(Element root, Element element) {
            return root != element && element.elementSiblingIndex() < index;
        }

        @Override
        public String toString() {
            // Locale.ROOT keeps the digits ASCII; the default locale may substitute localized digits for %d
            return String.format(java.util.Locale.ROOT, ":lt(%d)", index);
        }
    }

    /**
     * Evaluator for matching by sibling index number (e {@literal >} idx)
     */
    public static final class IndexGreaterThan extends IndexEvaluator {
        public IndexGreaterThan(int index) {
            super(index);
        }

        @Override
        public boolean matches(Element root, Element element) {
            return element.elementSiblingIndex() > index;
        }

        @Override
        public String toString() {
            // Locale.ROOT keeps the digits ASCII; the default locale may substitute localized digits for %d
            return String.format(java.util.Locale.ROOT, ":gt(%d)", index);
        }
    }

    /**
     * Evaluator for matching by sibling index number (e = idx)
     */
    public static final class IndexEquals extends IndexEvaluator {
        public IndexEquals(int index) {
            super(index);
        }

        @Override
        public boolean matches(Element root, Element element) {
            return element.elementSiblingIndex() == index;
        }

        @Override
        public String toString() {
            // Locale.ROOT keeps the digits ASCII; the default locale may substitute localized digits for %d
            return String.format(java.util.Locale.ROOT, ":eq(%d)", index);
        }
    }

    /**
     * Evaluator for matching the last sibling (css :last-child)
     */
    public static final class IsLastChild extends Evaluator {
        @Override
        public boolean matches(Element root, Element element) {
            final Element p = element.parent();
            // a child of the Document itself never matches
            return p != null && !(p instanceof Document) && element == p.lastElementChild();
        }

        @Override
        public String toString() {
            return ":last-child";
        }
    }

    public static final class IsFirstOfType extends IsNthOfType
{ public IsFirstOfType() { super(0,1); } @Override public String toString() { return ":first-of-type"; } } public static final class IsLastOfType extends IsNthLastOfType { public IsLastOfType() { super(0,1); } @Override public String toString() { return ":last-of-type"; } } public static abstract class CssNthEvaluator extends Evaluator { /** Step */ protected final int a; /** Offset */ protected final int b; public CssNthEvaluator(int step, int offset) { this.a = step; this.b = offset; } public CssNthEvaluator(int offset) { this(0, offset); } @Override public boolean matches(Element root, Element element) { final Element p = element.parent(); if (p == null || (p instanceof Document)) return false; final int pos = calculatePosition(root, element); if (a == 0) return pos == b; return (pos - b) * a >= 0 && (pos - b) % a == 0; } @Override public String toString() { String format = (a == 0) ? ":%s(%3$d)" // only offset (b) : (b == 0) ? ":%s(%2$dn)" // only step (a) : ":%s(%2$dn%3$+d)"; // step, offset return String.format(format, getPseudoClass(), a, b); } protected abstract String getPseudoClass(); protected abstract int calculatePosition(Element root, Element element); } /** * css-compatible Evaluator for :eq (css :nth-child) * * @see IndexEquals */ public static final class IsNthChild extends CssNthEvaluator { public IsNthChild(int step, int offset) { super(step, offset); } @Override protected int calculatePosition(Element root, Element element) { return element.elementSiblingIndex() + 1; } @Override protected String getPseudoClass() { return "nth-child"; } } /** * css pseudo class :nth-last-child) * * @see IndexEquals */ public static final class IsNthLastChild extends CssNthEvaluator { public IsNthLastChild(int step, int offset) { super(step, offset); } @Override protected int calculatePosition(Element root, Element element) { if (element.parent() == null) return 0; return element.parent().childrenSize() - element.elementSiblingIndex(); } @Override protected String 
getPseudoClass() {
            return "nth-last-child";
        }
    }

    /**
     * css pseudo class nth-of-type
     *
     */
    public static class IsNthOfType extends CssNthEvaluator {
        public IsNthOfType(int step, int offset) {
            super(step, offset);
        }

        @Override
        protected int calculatePosition(Element root, Element element) {
            final Element parent = element.parent();
            if (parent == null) {
                return 0;
            }

            // count child nodes with the same normal name, up to and including the element itself
            int position = 0;
            final int childCount = parent.childNodeSize();
            for (int idx = 0; idx < childCount; idx++) {
                final Node child = parent.childNode(idx);
                if (child.normalName().equals(element.normalName())) {
                    position++;
                }
                if (child == element) {
                    break;
                }
            }
            return position;
        }

        @Override
        protected String getPseudoClass() {
            return "nth-of-type";
        }
    }

    public static class IsNthLastOfType extends CssNthEvaluator {
        public IsNthLastOfType(int step, int offset) {
            super(step, offset);
        }

        @Override
        protected int calculatePosition(Element root, Element element) {
            final Element parent = element.parent();
            if (parent == null) {
                return 0;
            }

            // count the element itself plus every following element sibling with the same normal name
            int position = 0;
            for (Element cursor = element; cursor != null; cursor = cursor.nextElementSibling()) {
                if (cursor.normalName().equals(element.normalName())) {
                    position++;
                }
            }
            return position;
        }

        @Override
        protected String getPseudoClass() {
            return "nth-last-of-type";
        }
    }

    /**
     * Evaluator for matching the first sibling (css :first-child)
     */
    public static final class IsFirstChild extends Evaluator {
        @Override
        public boolean matches(Element root, Element element) {
            final Element parent = element.parent();
            if (parent == null || parent instanceof Document) {
                return false;
            }
            return element == parent.firstElementChild();
        }

        @Override
        public String toString() {
            return ":first-child";
        }
    }

    /**
     * css3 pseudo-class :root
     * @see <a href="http://www.w3.org/TR/selectors/#root-pseudo">:root selector</a>
     *
     */
    public static final class IsRoot extends Evaluator {
        @Override
        public boolean matches(Element root, Element element) {
            final Element r = root instanceof Document ?
root.firstElementChild() : root; return element == r; } @Override protected int cost() { return 1; } @Override public String toString() { return ":root"; } } public static final class IsOnlyChild extends Evaluator { @Override public boolean matches(Element root, Element element) { final Element p = element.parent(); return p!=null && !(p instanceof Document) && element.siblingElements().isEmpty(); } @Override public String toString() { return ":only-child"; } } public static final class IsOnlyOfType extends Evaluator { @Override public boolean matches(Element root, Element element) { final Element p = element.parent(); if (p==null || p instanceof Document) return false; int pos = 0; Element next = p.firstElementChild(); while (next != null) { if (next.normalName().equals(element.normalName())) pos++; if (pos > 1) break; next = next.nextElementSibling(); } return pos == 1; } @Override public String toString() { return ":only-of-type"; } } public static final class IsEmpty extends Evaluator { @Override public boolean matches(Element root, Element el) { for (Node n = el.firstChild(); n != null; n = n.nextSibling()) { if (n instanceof TextNode) { if (!((TextNode) n).isBlank()) return false; // non-blank text: not empty } else if (!(n instanceof Comment || n instanceof XmlDeclaration || n instanceof DocumentType)) return false; // non "blank" element: not empty } return true; } @Override public String toString() { return ":empty"; } } /** * Abstract evaluator for sibling index matching * * @author ant */ public abstract static class IndexEvaluator extends Evaluator { final int index; public IndexEvaluator(int index) { this.index = index; } } /** * Evaluator for matching Element (and its descendants) text */ public static final class ContainsText extends Evaluator { private final String searchText; public ContainsText(String searchText) { this.searchText = lowerCase(normaliseWhitespace(searchText)); } @Override public boolean matches(Element root, Element element) { 
return lowerCase(element.text()).contains(searchText); } @Override protected int cost() { return 10; } @Override public String toString() { return String.format(":contains(%s)", searchText); } } /** * Evaluator for matching Element (and its descendants) wholeText. Neither the input nor the element text is * normalized. :containsWholeText() * @since 1.15.1. */ public static final class ContainsWholeText extends Evaluator { private final String searchText; public ContainsWholeText(String searchText) { this.searchText = searchText; } @Override public boolean matches(Element root, Element element) { return element.wholeText().contains(searchText); } @Override protected int cost() { return 10; } @Override public String toString() { return String.format(":containsWholeText(%s)", searchText); } } /** * Evaluator for matching Element (but not its descendants) wholeText. Neither the input nor the element text is * normalized. :containsWholeOwnText() * @since 1.15.1. */ public static final class ContainsWholeOwnText extends Evaluator { private final String searchText; public ContainsWholeOwnText(String searchText) { this.searchText = searchText; } @Override public boolean matches(Element root, Element element) { return element.wholeOwnText().contains(searchText); } @Override public String toString() { return String.format(":containsWholeOwnText(%s)", searchText); } } /** * Evaluator for matching Element (and its descendants) data */ public static final class ContainsData extends Evaluator { private final String searchText; public ContainsData(String searchText) { this.searchText = lowerCase(searchText); } @Override public boolean matches(Element root, Element element) { return lowerCase(element.data()).contains(searchText); // not whitespace normalized } @Override public String toString() { return String.format(":containsData(%s)", searchText); } } /** * Evaluator for matching Element's own text */ public static final class ContainsOwnText extends Evaluator { private final 
String searchText; public ContainsOwnText(String searchText) { this.searchText = lowerCase(normaliseWhitespace(searchText)); } @Override public boolean matches(Element root, Element element) { return lowerCase(element.ownText()).contains(searchText); } @Override public String toString() { return String.format(":containsOwn(%s)", searchText); } } /** * Evaluator for matching Element (and its descendants) text with regex */ public static final class Matches extends Evaluator { private final Regex pattern; public Matches(Regex pattern) { this.pattern = pattern; } public Matches(Pattern pattern) { this(Regex.fromPattern(pattern)); } @Override public boolean matches(Element root, Element element) { return pattern.matcher(element.text()).find(); } @Override protected int cost() { return 8; } @Override public String toString() { return String.format(":matches(%s)", pattern); } } /** * Evaluator for matching Element's own text with regex */ public static final class MatchesOwn extends Evaluator { private final Regex pattern; public MatchesOwn(Regex pattern) { this.pattern = pattern; } public MatchesOwn(Pattern pattern) { this(Regex.fromPattern(pattern)); } @Override public boolean matches(Element root, Element element) { return pattern.matcher(element.ownText()).find(); } @Override protected int cost() { return 7; } @Override public String toString() { return String.format(":matchesOwn(%s)", pattern); } } /** * Evaluator for matching Element (and its descendants) whole text with regex. * @since 1.15.1. 
*/ public static final class MatchesWholeText extends Evaluator { private final Regex pattern; public MatchesWholeText(Regex pattern) { this.pattern = pattern; } public MatchesWholeText(Pattern pattern) { this.pattern = Regex.fromPattern(pattern); } @Override public boolean matches(Element root, Element element) { return pattern.matcher(element.wholeText()).find(); } @Override protected int cost() { return 8; } @Override public String toString() { return String.format(":matchesWholeText(%s)", pattern); } } /** * Evaluator for matching Element's own whole text with regex. * @since 1.15.1. */ public static final class MatchesWholeOwnText extends Evaluator { private final Regex pattern; public MatchesWholeOwnText(Regex pattern) { this.pattern = pattern; } public MatchesWholeOwnText(Pattern pattern) { this(Regex.fromPattern(pattern)); } @Override public boolean matches(Element root, Element element) { Regex.Matcher m = pattern.matcher(element.wholeOwnText()); return m.find(); } @Override protected int cost() { return 7; } @Override public String toString() { return String.format(":matchesWholeOwnText(%s)", pattern); } } /** @deprecated This selector is deprecated and will be removed in jsoup 1.24.1. Migrate to ::textnode using the Element#selectNodes() method instead. */ @Deprecated public static final class MatchText extends Evaluator { private static boolean loggedError = false; public MatchText() { // log a deprecated error on first use; users typically won't directly construct this Evaluator and so won't otherwise get deprecation warnings if (!loggedError) { loggedError = true; System.err.println("WARNING: :matchText selector is deprecated and will be removed in jsoup 1.24.1. 
Use Element#selectNodes(String, Class) with selector ::textnode and class TextNode instead."); } } @Override public boolean matches(Element root, Element element) { if (element instanceof PseudoTextElement) return true; List textNodes = element.textNodes(); for (TextNode textNode : textNodes) { PseudoTextElement pel = new PseudoTextElement( org.jsoup.parser.Tag.valueOf(element.tagName(), element.tag().namespace(), ParseSettings.preserveCase), element.baseUri(), element.attributes()); textNode.replaceWith(pel); pel.appendChild(textNode); } return false; } @Override protected int cost() { return -1; // forces first evaluation, which prepares the DOM for later evaluator matches } @Override public String toString() { return ":matchText"; } } } ================================================ FILE: src/main/java/org/jsoup/select/NodeEvaluator.java ================================================ package org.jsoup.select; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Element; import org.jsoup.nodes.LeafNode; import org.jsoup.nodes.Node; import org.jsoup.helper.Regex; import static org.jsoup.internal.Normalizer.lowerCase; import static org.jsoup.internal.StringUtil.normaliseWhitespace; abstract class NodeEvaluator extends Evaluator { @Override public boolean matches(Element root, Element element) { return evaluateMatch(element); } @Override boolean matches(Element root, LeafNode leaf) { return evaluateMatch(leaf); } abstract boolean evaluateMatch(Node node); @Override boolean wantsNodes() { return true; } static class InstanceType extends NodeEvaluator { final java.lang.Class type; final String selector; InstanceType(java.lang.Class type, String selector) { super(); this.type = type; this.selector = "::" + selector; } @Override boolean evaluateMatch(Node node) { return type.isInstance(node); } @Override protected int cost() { return 1; } @Override public String toString() { return selector; } } static class ContainsValue extends NodeEvaluator { private 
final String searchText; public ContainsValue(String searchText) { this.searchText = lowerCase(normaliseWhitespace(searchText)); } @Override boolean evaluateMatch(Node node) { return lowerCase(node.nodeValue()).contains(searchText); } @Override protected int cost() { return 6; } @Override public String toString() { return String.format(":contains(%s)", searchText); } } /** Matches nodes with no value or only whitespace. */ static class BlankValue extends NodeEvaluator { @Override boolean evaluateMatch(Node node) { return StringUtil.isBlank(node.nodeValue()); } @Override protected int cost() { return 4; } @Override public String toString() { return ":blank"; } } static class MatchesValue extends NodeEvaluator { private final Regex pattern; protected MatchesValue(Regex pattern) { this.pattern = pattern; } @Override boolean evaluateMatch(Node node) { return pattern.matcher(node.nodeValue()).find(); } @Override protected int cost() { return 8; } @Override public String toString() { return String.format(":matches(%s)", pattern); } } } ================================================ FILE: src/main/java/org/jsoup/select/NodeFilter.java ================================================ package org.jsoup.select; import org.jsoup.nodes.Node; /** A controllable Node visitor interface. Execute via {@link #traverse(Node)}.

 <p>
 This interface provides two methods, {@code head} and {@code tail}. The head method is called when a node is first seen, and the tail method when all that node's children have been visited.
 </p>
 <p>
 For each visited node, the resulting action may be:
 </p>
 <ul>
 <li>continue ({@link FilterResult#CONTINUE}),</li>
 <li>skip all children ({@link FilterResult#SKIP_CHILDREN}),</li>
 <li>skip node entirely ({@link FilterResult#SKIP_ENTIRELY}),</li>
 <li>remove the subtree ({@link FilterResult#REMOVE}),</li>
 <li>interrupt the iteration and return ({@link FilterResult#STOP}).</li>
 </ul>
 <p>
 The difference between {@link FilterResult#SKIP_CHILDREN} and {@link FilterResult#SKIP_ENTIRELY} is that the first will invoke {@link NodeFilter#tail(Node, int)} on the node, while the latter will not. Within {@link NodeFilter#tail(Node, int)}, both are equivalent to {@link FilterResult#CONTINUE}.
 </p>

*/
public interface NodeFilter {
    /** Traversal action. */
    enum FilterResult {
        /** Continue processing the tree */
        CONTINUE,
        /** Skip the child nodes, but do call {@link NodeFilter#tail(Node, int)} next. */
        SKIP_CHILDREN,
        /** Skip the subtree, and do not call {@link NodeFilter#tail(Node, int)}. */
        SKIP_ENTIRELY,
        /** Remove the node and its children */
        REMOVE,
        /** Stop processing */
        STOP
    }

    /**
     * Callback for when a node is first visited.
     * @param node the node being visited.
     * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node of that will have depth 1.
     * @return Traversal action
     */
    FilterResult head(Node node, int depth);

    /**
     * Callback for when a node is last visited, after all of its descendants have been visited.
     * <p>
     * This method has a default implementation to return {@link FilterResult#CONTINUE}.
     * </p>
     * @param node the node being visited.
     * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node of that will have depth 1.
     * @return Traversal action
     */
    default FilterResult tail(Node node, int depth) {
        return FilterResult.CONTINUE;
    }

    /**
     Run a depth-first controlled traverse of the root and all of its descendants. Delegates to
     {@link NodeTraversor#filter(NodeFilter, Node)}.
     @param root the initial node point to traverse.
     @since 1.21.1
     */
    default void traverse(Node root) {
        NodeTraversor.filter(this, root);
    }
}

================================================
FILE: src/main/java/org/jsoup/select/NodeTraversor.java
================================================
package org.jsoup.select;

import org.jsoup.helper.Validate;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeFilter.FilterResult;

/**
 A depth-first node traversor. Use to walk through all nodes under and including the specified root node, in document order. The {@link NodeVisitor#head(Node, int)} and {@link NodeVisitor#tail(Node, int)} methods will be called for each node.

 <p>
 During the head() visit, DOM structural changes around the node currently being visited are supported, including {@link Node#replaceWith(Node)} and {@link Node#remove()}. See {@link NodeVisitor#head(Node, int) head()} for the traversal contract after mutation. Other non-structural node changes are also supported.
 </p>
 <p>
 DOM structural changes to the current node are not supported during the tail() visit.
 </p>

*/
public class NodeTraversor {
    // cursor state
    private static final byte VisitHead = 0; // head() is due on the current node
    private static final byte AfterHead = 1; // head() is done (or skipped); descend into children, else tail
    private static final byte VisitTail = 2; // children are done; tail() is due on the current node

    /**
     Run a depth-first traverse of the root and all of its descendants.
     @param visitor Node visitor.
     @param root the initial node point to traverse.
     @see NodeVisitor#traverse(Node root)
     */
    public static void traverse(NodeVisitor visitor, Node root) {
        Validate.notNull(visitor);
        Validate.notNull(root);
        Node node = root;
        final Node rootNext = root.nextSibling(); // don't traverse siblings beyond the original root
        int depth = 0;
        byte state = VisitHead;

        while (true) {
            if (state == VisitHead) {
                // snapshot the current cursor position so we can recover if head() structurally changes it:
                Node parent = node.parentNode();
                Node next = node.nextSibling();
                int sibIndex = parent != null ? node.siblingIndex() : 0;
                visitor.head(node, depth);

                // any structural changes?
                if (parent != null && node.parentNode() != parent) { // removed / replaced / moved
                    Node occupant = sibIndex < parent.childNodeSize() ? parent.childNode(sibIndex) : null;
                    // ^^ the node now at this node's former position
                    Node boundary = depth == 0 ? rootNext : next; // don't advance beyond this node when resuming
                    if (occupant != null && occupant != boundary) {
                        node = occupant;
                        state = AfterHead; // continue from that slot without re-heading it
                    } else if (depth == 0) { // root detached or replaced
                        break;
                    } else if (next != null && next.parentNode() == parent) {
                        node = next; // old slot is empty or shifted to the original next, visit
                    } else { // removed last child; tail the parent next
                        node = parent;
                        depth--;
                        state = VisitTail;
                    }
                } else {
                    state = AfterHead;
                }
                continue; // next loop handles the updated node/state
            }

            if (state == AfterHead && node.childNodeSize() > 0) { // descend into current children
                node = node.childNode(0);
                depth++;
                state = VisitHead;
                continue;
            }

            // reached in AfterHead with no children, or in VisitTail
            visitor.tail(node, depth);

            Node next = node.nextSibling();
            if (depth == 0) {
                if (next == null || next == rootNext) break; // done with the original root range
                node = next;
                state = VisitHead;
            } else if (next != null) { // traverse siblings
                node = next;
                state = VisitHead;
            } else { // no siblings left, ascend
                node = node.parentNode();
                depth--;
                state = VisitTail;
            }
        }
    }

    /**
     Run a depth-first traversal of each Element.
     @param visitor Node visitor.
     @param elements Elements to traverse.
     */
    public static void traverse(NodeVisitor visitor, Elements elements) {
        Validate.notNull(visitor);
        Validate.notNull(elements);
        for (Element el : elements)
            traverse(visitor, el);
    }

    /**
     Run a depth-first filtered traversal of the root and all of its descendants.
     @param filter NodeFilter visitor.
     @param root the root node point to traverse.
     @return The filter result of the root node, or {@link FilterResult#STOP}.
     @see NodeFilter
     */
    public static FilterResult filter(NodeFilter filter, Node root) {
        Node node = root;
        int depth = 0;

        while (node != null) {
            FilterResult result = filter.head(node, depth);
            if (result == FilterResult.STOP)
                return result;
            // Descend into child nodes:
            if (result == FilterResult.CONTINUE && node.childNodeSize() > 0) {
                node = node.childNode(0);
                ++depth;
                continue;
            }
            // No siblings, move upwards:
            while (true) {
                assert node != null; // depth > 0, so has parent
                if (!(node.nextSibling() == null && depth > 0)) break;
                // 'tail' current node:
                if (result == FilterResult.CONTINUE || result == FilterResult.SKIP_CHILDREN) {
                    result = filter.tail(node, depth);
                    if (result == FilterResult.STOP)
                        return result;
                }
                Node prev = node; // In case we need to remove it below.
                node = node.parentNode();
                depth--;
                if (result == FilterResult.REMOVE)
                    prev.remove(); // Remove AFTER finding parent.
                result = FilterResult.CONTINUE; // Parent was not pruned.
            }
            // 'tail' current node, then proceed with siblings:
            if (result == FilterResult.CONTINUE || result == FilterResult.SKIP_CHILDREN) {
                result = filter.tail(node, depth);
                if (result == FilterResult.STOP)
                    return result;
            }
            // back at the root: hand its result to the caller (the root itself is not removed here)
            if (node == root)
                return result;
            Node prev = node; // In case we need to remove it below.
            node = node.nextSibling();
            if (result == FilterResult.REMOVE)
                prev.remove(); // Remove AFTER finding sibling.
        }
        // root == null?
        return FilterResult.CONTINUE;
    }

    /**
     Run a depth-first filtered traversal of each Element. Stops at the first Element whose traversal returns
     {@link FilterResult#STOP}.
     @param filter NodeFilter visitor.
     @param elements Elements to filter.
     @see NodeFilter
     */
    public static void filter(NodeFilter filter, Elements elements) {
        Validate.notNull(filter);
        Validate.notNull(elements);
        for (Element el : elements)
            if (filter(filter, el) == FilterResult.STOP)
                break;
    }
}

================================================
FILE: src/main/java/org/jsoup/select/NodeVisitor.java
================================================
package org.jsoup.select;

import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

/**
 Node visitor interface, used to walk the DOM and visit each node. Execute via {@link #traverse(Node)} or {@link Node#traverse(NodeVisitor)}. The traversal is depth-first.

 <p>
 This interface provides two methods, {@link #head} and {@link #tail}. The head method is called when a node is first seen, and the tail method when all that node's children have been visited. As an example, {@code head} can be used to emit a start tag for a node, and {@code tail} to emit the end tag. The {@code tail} method defaults to a no-op, so this interface can be used as a {@link FunctionalInterface}, with {@code head} as its single abstract method.
 </p>
 <p>
 Example:
 </p>
 <pre><code>
 doc.body().traverse((node, depth) -&gt; {
     switch (node) {
         case Element el     -&gt; print(el.tag() + ": " + el.ownText());
         case DataNode data  -&gt; print("Data: " + data.getWholeData());
         default             -&gt; print(node.nodeName() + " at depth " + depth);
     }
 });
 </code></pre>
 
*/
@FunctionalInterface
public interface NodeVisitor {
    /**
     Callback for when a node is first visited.
     <p>
     The node may be modified (for example via {@link Node#attr(String)}), removed with {@link Node#remove()}, or replaced with {@link Node#replaceWith(Node)}. If the node is an {@link Element}, you may cast it and access those methods.
     </p>
     <p>
     Traversal uses a forward cursor. After {@code head()} completes:
     </p>
     <ul>
     <li>If the current node is still attached, traversal continues into its current children and then its following siblings. Nodes inserted before the current node are not visited.</li>
     <li>If the current node was detached and another node now occupies its former sibling position, the node now at that position is not passed to {@code head()} again. Traversal continues from there: its children are visited, then the node is passed to {@link #tail(Node, int)}, then later siblings are visited.</li>
     <li>If the current node was detached and no node occupies its former sibling position, the current node is not passed to {@code tail()}, and traversal resumes at the node that originally followed it.</li>
     </ul>
     <p>
     Traversal never advances outside the original root subtree. If the traversal root is detached during {@code head()}, traversal stops at the original root boundary.
     </p>

     @param node the node being visited.
     @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node of that will have depth 1.
     */
    void head(Node node, int depth);

    /**
     Callback for when a node is last visited, after all of its descendants have been visited.
     <p>
     This method defaults to a no-op.
     </p>
     <p>
     The node passed to {@code tail()} is the node at the current traversal position when the subtree completes. If {@code head()} replaced the original node, this may be the replacement node instead.
     </p>
     <p>
     Structural changes to the current node are not supported during {@code tail()}.
     </p>

     @param node the node being visited.
     @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node of that will have depth 1.
     */
    default void tail(Node node, int depth) {
        // no-op by default, to allow just specifying the head() method
    }

    /**
     Run a depth-first traverse of the root and all of its descendants. Delegates to
     {@link NodeTraversor#traverse(NodeVisitor, Node)}.
     @param root the initial node point to traverse.
     @since 1.21.1
     */
    default void traverse(Node root) {
        NodeTraversor.traverse(this, root);
    }
}

================================================
FILE: src/main/java/org/jsoup/select/Nodes.java
================================================
package org.jsoup.select;

import org.jsoup.helper.Validate;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jspecify.annotations.Nullable;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.function.Predicate;
import java.util.function.UnaryOperator;

/**
 A list of {@link Node} objects, with methods that act on every node in the list.

 <p>
 Methods that {@link #set(int, T) set}, {@link #remove(int) remove}, or {@link #replaceAll(UnaryOperator) replace} nodes in the list will also act on the underlying {@link org.jsoup.nodes.Document DOM}.
 </p>
 <p>
 If there are other bulk methods (perhaps from Elements) that would be useful here, please provide feedback.
 </p>

@see Element#selectNodes(String) @see Element#selectNodes(String, Class) @since 1.21.1 */ public class Nodes extends ArrayList { public Nodes() { } public Nodes(int initialCapacity) { super(initialCapacity); } public Nodes(Collection nodes) { super(nodes); } public Nodes(List nodes) { super(nodes); } @SafeVarargs public Nodes(T... nodes) { super(Arrays.asList(nodes)); } /** * Creates a deep copy of these nodes. * @return a deep copy */ @Override public Nodes clone() { Nodes clone = new Nodes<>(size()); for (T node : this) clone.add((T) node.clone()); return clone; } /** Convenience method to get the Nodes as a plain ArrayList. This allows modification to the list of nodes without modifying the source Document. I.e. whereas calling {@code nodes.remove(0)} will remove the nodes from both the Nodes and the DOM, {@code nodes.asList().remove(0)} will remove the node from the list only.

Each Node is still the same DOM connected Node.

@return a new ArrayList containing the nodes in this list @see #Nodes(List) */ public ArrayList asList() { return new ArrayList<>(this); } /** Remove each matched node from the DOM.

The nodes will still be retained in this list, in case further processing of them is desired.

E.g. HTML: {@code

Hello

there

}
doc.select("p").remove();
HTML = {@code
}

Note that this method should not be used to clean user-submitted HTML; rather, use {@link org.jsoup.safety.Cleaner} to clean HTML. @return this, for chaining @see Element#empty() @see Elements#empty() @see #clear() */ public Nodes remove() { for (T node : this) { node.remove(); } return this; } /** Get the combined outer HTML of all matched nodes. @return string of all node's outer HTML. @see Elements#text() @see Elements#html() */ public String outerHtml() { return stream() .map(Node::outerHtml) .collect(StringUtil.joining("\n")); } /** Get the combined outer HTML of all matched nodes. Alias of {@link #outerHtml()}. @return string of all the node's outer HTML. @see Elements#text() @see #outerHtml() */ @Override public String toString() { return outerHtml(); } /** Insert the supplied HTML before each matched node's outer HTML. @param html HTML to insert before each node @return this, for chaining @see Element#before(String) */ public Nodes before(String html) { for (T node : this) { node.before(html); } return this; } /** Insert the supplied HTML after each matched nodes's outer HTML. @param html HTML to insert after each node @return this, for chaining @see Element#after(String) */ public Nodes after(String html) { for (T node : this) { node.after(html); } return this; } /** Wrap the supplied HTML around each matched node. For example, with HTML {@code

This is Jsoup

}, doc.select("b").wrap("<i></i>"); becomes {@code

This is jsoup

} @param html HTML to wrap around each node, e.g. {@code
}. Can be arbitrarily deep. @return this (for chaining) @see Element#wrap */ public Nodes wrap(String html) { Validate.notEmpty(html); for (T node : this) { node.wrap(html); } return this; } // list-like methods /** Get the first matched element. @return The first matched element, or null if contents is empty. */ public @Nullable T first() { return isEmpty() ? null : get(0); } /** Get the last matched element. @return The last matched element, or null if contents is empty. */ public @Nullable T last() { return isEmpty() ? null : get(size() - 1); } // ArrayList methods that update the DOM: /** Replace the node at the specified index in this list, and in the DOM. @param index index of the node to replace @param node node to be stored at the specified position @return the old Node at this index */ @Override public T set(int index, T node) { Validate.notNull(node); T old = super.set(index, node); old.replaceWith(node); return old; } /** Remove the node at the specified index in this list, and from the DOM. @param index the index of the node to be removed @return the old node at this index @see #deselect(int) */ @Override public T remove(int index) { T old = super.remove(index); old.remove(); return old; } /** Remove the specified node from this list, and from the DOM. @param o node to be removed from this list, if present @return if this list contained the Node @see #deselect(Object) */ @Override public boolean remove(Object o) { int index = super.indexOf(o); if (index == -1) { return false; } else { remove(index); return true; } } /** Remove the node at the specified index in this list, but not from the DOM. @param index the index of the node to be removed @return the old node at this index @see #remove(int) */ public T deselect(int index) { return super.remove(index); } /** Remove the specified node from this list, but not from the DOM. 
@param o node to be removed from this list, if present @return if this list contained the Node @see #remove(Object) */ public boolean deselect(Object o) { return super.remove(o); } /** Removes all the nodes from this list, and each of them from the DOM. @see #deselectAll() */ @Override public void clear() { remove(); super.clear(); } /** Like {@link #clear()}, removes all the nodes from this list, but not from the DOM. @see #clear() */ public void deselectAll() { super.clear(); } /** Removes from this list, and from the DOM, each of the nodes that are contained in the specified collection and are in this list. @param c collection containing nodes to be removed from this list @return {@code true} if nodes were removed from this list */ @Override public boolean removeAll(Collection c) { boolean anyRemoved = false; for (Object o : c) { anyRemoved |= this.remove(o); } return anyRemoved; } /** Retain in this list, and in the DOM, only the nodes that are in the specified collection and are in this list. In other words, remove nodes from this list and the DOM any item that is in this list but not in the specified collection. @param toRemove collection containing nodes to be retained in this list @return {@code true} if nodes were removed from this list @since 1.17.1 */ @Override public boolean retainAll(Collection toRemove) { boolean anyRemoved = false; for (Iterator it = this.iterator(); it.hasNext(); ) { T el = it.next(); if (!toRemove.contains(el)) { it.remove(); anyRemoved = true; } } return anyRemoved; } /** Remove from the list, and from the DOM, all nodes in this list that mach the given predicate. 
@param filter a predicate which returns {@code true} for nodes to be removed
     @return {@code true} if nodes were removed from this list
     */
    @Override
    public boolean removeIf(Predicate filter) {
        boolean anyRemoved = false;
        for (Iterator it = this.iterator(); it.hasNext(); ) {
            T node = it.next();
            if (filter.test(node)) {
                // NOTE(review): removal goes through this list's iterator, which is defined outside this excerpt -
                // confirm that its remove() also detaches the node from the DOM
                it.remove();
                anyRemoved = true;
            }
        }
        return anyRemoved;
    }

    /**
     Replace each node in this list with the result of the operator, and update the DOM.
     @param operator the operator to apply to each node
     */
    @Override
    public void replaceAll(UnaryOperator operator) {
        // goes through set(int, node) on this list for every index, rather than writing to the backing list directly
        for (int i = 0; i < this.size(); i++) {
            this.set(i, operator.apply(this.get(i)));
        }
    }
}

================================================ FILE: src/main/java/org/jsoup/select/QueryParser.java ================================================
package org.jsoup.select;

import org.jsoup.helper.Regex;
import org.jsoup.internal.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.LeafNode;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.TokenQueue;
import org.jspecify.annotations.Nullable;

import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun;
import static org.jsoup.internal.Normalizer.normalize;

/**
 * Parses a CSS selector into an Evaluator tree.
 */
public class QueryParser implements AutoCloseable {
    // explicit combinator characters, checked with tq.matchesAny() while parsing a selector
    private final static char[] Combinators = {'>', '+', '~'}; // ' ' is also a combinator, but found implicitly
    // operators that end an attribute key inside [...]; the key is consumed up to any of these
    private final static String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="};
    // characters that stop the combinator loop in parseSelector()
    private final static char[] SequenceEnders = {',', ')'};

    private final TokenQueue tq; // queue over the (trimmed) query being parsed
    private final String query; // the trimmed query text; used in error messages and toString()
    private boolean inNodeContext; // ::comment:contains should act on node value, vs element text

    /**
     * Create a new QueryParser.
* @param query CSS query */ private QueryParser(String query) { Validate.notEmpty(query); query = query.trim(); this.query = query; this.tq = new TokenQueue(query); } /** Parse a CSS query into an Evaluator. If you are evaluating the same query repeatedly, it may be more efficient to parse it once and reuse the Evaluator. @param query CSS query @return Evaluator @see Selector selector query syntax @throws Selector.SelectorParseException if the CSS query is invalid */ public static Evaluator parse(String query) { try (QueryParser p = new QueryParser(query)) { return p.parse(); } catch (IllegalArgumentException e) { throw new Selector.SelectorParseException(e.getMessage()); } } /** Parse the query. We use this simplified expression of the grammar:
     SelectorGroup   ::= Selector (',' Selector)*
     Selector        ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )*
     SimpleSequence  ::= [ TypeSelector ] ( ID | Class | Attribute | Pseudo )*
     Pseudo           ::= ':' Name [ '(' SelectorGroup ')' ]
     Combinator      ::= S+         // descendant (whitespace)
     | '>'       // child
     | '+'       // adjacent sibling
     | '~'       // general sibling
     
See selectors-4 for the real thing
     @return the Evaluator for the whole query
     @throws Selector.SelectorParseException if unparsed input remains after the selector group
     */
    Evaluator parse() {
        Evaluator eval = parseSelectorGroup();
        tq.consumeWhitespace();
        if (!tq.isEmpty()) // anything left over at this point was not consumed by the grammar
            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
        return eval;
    }

    /**
     Parses one or more comma-separated selectors. A single selector is returned as-is; additional selectors are
     merged via {@link #or(Evaluator, Evaluator)}.
     */
    Evaluator parseSelectorGroup() {
        // SelectorGroup. Into an Or if > 1 Selector
        Evaluator left = parseSelector();
        while (tq.matchChomp(',')) {
            Evaluator right = parseSelector();
            left = or(left, right);
        }
        return left;
    }

    /**
     Parses a single selector: an optional leading combinator, then simple sequences joined by combinators. Stops at a
     sequence ender ({@code ,} or {@code )}) or when no combinator follows.
     */
    Evaluator parseSelector() {
        // Selector ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )*
        tq.consumeWhitespace();

        Evaluator left;
        if (tq.matchesAny(Combinators)) {
            // e.g. query is "> div"; left side is root element
            left = new StructuralEvaluator.Root();
        } else {
            left = parseSimpleSequence();
        }

        while (true) {
            char combinator = 0; // 0 means "no combinator found in this pass"
            if (tq.consumeWhitespace())
                combinator = ' '; // maybe descendant?
            if (tq.matchesAny(Combinators)) // no, explicit
                combinator = tq.consume();
            else if (tq.matchesAny(SequenceEnders)) // , - space after simple like "foo , bar"; ) - close of :has()
                break;

            if (combinator != 0) {
                Evaluator right = parseSimpleSequence();
                left = combinator(left, combinator, right);
            } else {
                break;
            }
        }
        return left;
    }

    /**
     Parses a simple sequence: an optional type selector (or {@code *}) followed by any number of subclass selectors,
     all combined with {@link #and(Evaluator, Evaluator)}.
     @throws Selector.SelectorParseException if nothing could be parsed at the current position
     */
    Evaluator parseSimpleSequence() {
        // SimpleSequence ::= TypeSelector? ( Hash | Class | Pseudo )*
        Evaluator left = null;
        tq.consumeWhitespace();

        // one optional type selector
        if (tq.matchesWord() || tq.matches("*|"))
            left = byTag();
        else if (tq.matchChomp('*'))
            left = new Evaluator.AllElements();

        // zero or more subclasses (#, ., [)
        while(true) {
            Evaluator right = parseSubclass();
            if (right != null) {
                left = and(left, right);
            }
            else break; // no more simple tokens
        }

        if (left == null)
            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
        return left;
    }

    /**
     Builds the evaluator for {@code left <combinator> right}.
     @param left evaluator for the left-hand side
     @param combinator one of {@code '>'}, {@code ' '}, {@code '+'}, {@code '~'}
     @param right evaluator for the right-hand side
     @throws Selector.SelectorParseException for any other combinator character
     */
    static Evaluator combinator(Evaluator left, char combinator, Evaluator right) {
        switch (combinator) {
            case '>':
                // reuse an existing run so that a chain like a > b > c accumulates into one ImmediateParentRun
                ImmediateParentRun run = left instanceof ImmediateParentRun ?
                    (ImmediateParentRun) left : new ImmediateParentRun(left);
                run.add(right);
                return run;
            case ' ':
                return and(new StructuralEvaluator.Ancestor(left), right);
            case '+':
                return and(new StructuralEvaluator.ImmediatePreviousSibling(left), right);
            case '~':
                return and(new StructuralEvaluator.PreviousSibling(left), right);
            default:
                throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator);
        }
    }

    /**
     Parses one subclass selector at the current position, dispatching on its leading character(s).
     @return the evaluator, or {@code null} if the queue is not positioned at a subclass selector
     */
    @Nullable Evaluator parseSubclass() {
        //  Subclass: ID | Class | Attribute | Pseudo
        if      (tq.matchChomp('#'))    return byId();
        else if (tq.matchChomp('.'))    return byClass();
        else if (tq.matches('['))       return byAttribute(); // '[' is not chomped here; byAttribute() consumes the balanced [...]
        else if (tq.matchChomp("::"))   return parseNodeSelector(); // ::comment etc; tested before the single ':'
        else if (tq.matchChomp(':'))    return parsePseudoSelector();
        else                            return null;
    }

    /**
     Merge two evals into an Or. If {@code left} is already an Or, {@code right} is added to it and {@code left} is
     returned; otherwise a new Or of both is created.
     */
    static Evaluator or(Evaluator left, Evaluator right) {
        if (left instanceof CombiningEvaluator.Or) {
            ((CombiningEvaluator.Or) left).add(right);
            return left;
        }
        return new CombiningEvaluator.Or(left, right);
    }

    /**
     Merge two evals into an And.
*/ static Evaluator and(@Nullable Evaluator left, Evaluator right) { if (left == null) return right; if (left instanceof CombiningEvaluator.And) { ((CombiningEvaluator.And) left).add(right); return left; } return new CombiningEvaluator.And(left, right); } private Evaluator parsePseudoSelector() { final String pseudo = tq.consumeCssIdentifier(); switch (pseudo) { case "lt": return new Evaluator.IndexLessThan(consumeIndex()); case "gt": return new Evaluator.IndexGreaterThan(consumeIndex()); case "eq": return new Evaluator.IndexEquals(consumeIndex()); case "has": return has(); case "is": return is(); case "contains": return contains(false); case "containsOwn": return contains(true); case "containsWholeText": return containsWholeText(false); case "containsWholeOwnText": return containsWholeText(true); case "containsData": return containsData(); case "matches": return matches(false); case "matchesOwn": return matches(true); case "matchesWholeText": return matchesWholeText(false); case "matchesWholeOwnText": return matchesWholeText(true); case "not": return not(); case "nth-child": return cssNthChild(false, false); case "nth-last-child": return cssNthChild(true, false); case "nth-of-type": return cssNthChild(false, true); case "nth-last-of-type": return cssNthChild(true, true); case "first-child": return new Evaluator.IsFirstChild(); case "last-child": return new Evaluator.IsLastChild(); case "first-of-type": return new Evaluator.IsFirstOfType(); case "last-of-type": return new Evaluator.IsLastOfType(); case "only-child": return new Evaluator.IsOnlyChild(); case "only-of-type": return new Evaluator.IsOnlyOfType(); case "empty": return new Evaluator.IsEmpty(); case "blank": return new NodeEvaluator.BlankValue(); case "root": return new Evaluator.IsRoot(); case "matchText": return new Evaluator.MatchText(); default: throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); } } // ::comment etc private 
Evaluator parseNodeSelector() { final String pseudo = tq.consumeCssIdentifier(); inNodeContext = true; // Enter node context Evaluator left; switch (pseudo) { case "node": left = new NodeEvaluator.InstanceType(Node.class, pseudo); break; case "leafnode": left = new NodeEvaluator.InstanceType(LeafNode.class, pseudo); break; case "text": left = new NodeEvaluator.InstanceType(TextNode.class, pseudo); break; case "comment": left = new NodeEvaluator.InstanceType(Comment.class, pseudo); break; case "data": left = new NodeEvaluator.InstanceType(DataNode.class, pseudo); break; case "cdata": left = new NodeEvaluator.InstanceType(CDataNode.class, pseudo); break; default: throw new Selector.SelectorParseException( "Could not parse query '%s': unknown node type '::%s'", query, pseudo); } // Handle following subclasses in node context (like ::comment:contains()) Evaluator right; while ((right = parseSubclass()) != null) { left = and(left, right); } inNodeContext = false; return left; } private Evaluator byId() { String id = tq.consumeCssIdentifier(); Validate.notEmpty(id); return new Evaluator.Id(id); } private Evaluator byClass() { String className = tq.consumeCssIdentifier(); Validate.notEmpty(className); return new Evaluator.Class(className.trim()); } private Evaluator byTag() { // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make // the tag in the selector case-sensitive (and also attribute names). 
But for now, normalize (lower-case) for // consistency - both the selector and the element tag String tagName = normalize(tq.consumeElementSelector()); Validate.notEmpty(tagName); // namespaces: if (tagName.startsWith("*|")) { // namespaces: wildcard match equals(tagName) or ending in ":"+tagName String plainTag = tagName.substring(2); // strip *| return new CombiningEvaluator.Or( new Evaluator.Tag(plainTag), new Evaluator.TagEndsWith(":" + plainTag) ); } else if (tagName.endsWith("|*")) { // ns|* String ns = tagName.substring(0, tagName.length() - 2) + ":"; // strip |*, to ns: return new Evaluator.TagStartsWith(ns); } else if (tagName.contains("|")) { // flip "abc|def" to "abc:def" tagName = tagName.replace("|", ":"); } return new Evaluator.Tag(tagName); } private Evaluator byAttribute() { try (TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']'))) { return evaluatorForAttribute(cq); } } private Evaluator evaluatorForAttribute(TokenQueue cq) { String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val) key = normalize(key); Validate.notEmpty(key); Validate.isFalse(key.equals("abs:"), "Absolute attribute key must have a name"); cq.consumeWhitespace(); final Evaluator eval; if (cq.isEmpty()) { if (key.startsWith("^")) eval = new Evaluator.AttributeStarting(key.substring(1)); else if (key.equals("*")) // any attribute eval = new Evaluator.AttributeStarting(""); else eval = new Evaluator.Attribute(key); } else { if (cq.matchChomp('=')) eval = new Evaluator.AttributeWithValue(key, cq.remainder()); else if (cq.matchChomp("!=")) eval = new Evaluator.AttributeWithValueNot(key, cq.remainder()); else if (cq.matchChomp("^=")) eval = new Evaluator.AttributeWithValueStarting(key, cq.remainder()); else if (cq.matchChomp("$=")) eval = new Evaluator.AttributeWithValueEnding(key, cq.remainder()); else if (cq.matchChomp("*=")) eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder()); else if (cq.matchChomp("~=")) eval = new 
Evaluator.AttributeWithValueMatching(key, Regex.compile(cq.remainder())); else throw new Selector.SelectorParseException( "Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder()); } return eval; } //pseudo selectors :first-child, :last-child, :nth-child, ... private static final Pattern NthStepOffset = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE); private static final Pattern NthOffset = Pattern.compile("([+-])?(\\d+)"); private Evaluator cssNthChild(boolean last, boolean ofType) { String arg = normalize(consumeParens()); // arg is like "odd", or "-n+2", within nth-child(odd) final int step, offset; if ("odd".equals(arg)) { step = 2; offset = 1; } else if ("even".equals(arg)) { step = 2; offset = 0; } else { Matcher stepOffsetM, stepM; if ((stepOffsetM = NthStepOffset.matcher(arg)).matches()) { if (stepOffsetM.group(3) != null) // has digits, like 3n+2 or -3n+2 step = Integer.parseInt(stepOffsetM.group(1).replaceFirst("^\\+", "")); else // no digits, might be like n+2, or -n+2. if group(2) == "-", it’s -1; step = "-".equals(stepOffsetM.group(2)) ? -1 : 1; offset = stepOffsetM.group(4) != null ? Integer.parseInt(stepOffsetM.group(4).replaceFirst("^\\+", "")) : 0; } else if ((stepM = NthOffset.matcher(arg)).matches()) { step = 0; offset = Integer.parseInt(stepM.group().replaceFirst("^\\+", "")); } else { throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg); } } return ofType ? (last ? new Evaluator.IsNthLastOfType(step, offset) : new Evaluator.IsNthOfType(step, offset)) : (last ? 
new Evaluator.IsNthLastChild(step, offset) : new Evaluator.IsNthChild(step, offset)); } private String consumeParens() { return tq.chompBalanced('(', ')'); } private int consumeIndex() { String index = consumeParens().trim(); Validate.isTrue(StringUtil.isNumeric(index), "Index must be numeric"); return Integer.parseInt(index); } // pseudo selector :has(el) private Evaluator has() { return parseNested(StructuralEvaluator.Has::new, ":has() must have a selector"); } // pseudo selector :is() private Evaluator is() { return parseNested(StructuralEvaluator.Is::new, ":is() must have a selector"); } private Evaluator parseNested(Function func, String err) { Validate.isTrue(tq.matchChomp('('), err); Evaluator eval = parseSelectorGroup(); Validate.isTrue(tq.matchChomp(')'), err); return func.apply(eval); } // pseudo selector :contains(text), containsOwn(text) private Evaluator contains(boolean own) { String query = own ? ":containsOwn" : ":contains"; String searchText = TokenQueue.unescape(consumeParens()); Validate.notEmpty(searchText, query + "(text) query must not be empty"); if (inNodeContext) return new NodeEvaluator.ContainsValue(searchText); return own ? new Evaluator.ContainsOwnText(searchText) : new Evaluator.ContainsText(searchText); } private Evaluator containsWholeText(boolean own) { String query = own ? ":containsWholeOwnText" : ":containsWholeText"; String searchText = TokenQueue.unescape(consumeParens()); Validate.notEmpty(searchText, query + "(text) query must not be empty"); return own ? new Evaluator.ContainsWholeOwnText(searchText) : new Evaluator.ContainsWholeText(searchText); } // pseudo selector :containsData(data) private Evaluator containsData() { String searchText = TokenQueue.unescape(consumeParens()); Validate.notEmpty(searchText, ":containsData(text) query must not be empty"); return new Evaluator.ContainsData(searchText); } // :matches(regex), matchesOwn(regex) private Evaluator matches(boolean own) { String query = own ? 
":matchesOwn" : ":matches"; String regex = consumeParens(); // don't unescape, as regex bits will be escaped Validate.notEmpty(regex, query + "(regex) query must not be empty"); Regex pattern = Regex.compile(regex); if (inNodeContext) return new NodeEvaluator.MatchesValue(pattern); return own ? new Evaluator.MatchesOwn(pattern) : new Evaluator.Matches(pattern); } // :matches(regex), matchesOwn(regex) private Evaluator matchesWholeText(boolean own) { String query = own ? ":matchesWholeOwnText" : ":matchesWholeText"; String regex = consumeParens(); // don't unescape, as regex bits will be escaped Validate.notEmpty(regex, query + "(regex) query must not be empty"); Regex pattern = Regex.compile(regex); return own ? new Evaluator.MatchesWholeOwnText(pattern) : new Evaluator.MatchesWholeText(pattern); } // :not(selector) private Evaluator not() { String subQuery = consumeParens(); Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty"); return new StructuralEvaluator.Not(parse(subQuery)); } @Override public String toString() { return query; } @Override public void close() { tq.close(); } } ================================================ FILE: src/main/java/org/jsoup/select/Selector.java ================================================ package org.jsoup.select; import org.jsoup.helper.Validate; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.parser.TokenQueue; import org.jspecify.annotations.Nullable; import java.util.Collection; import java.util.HashSet; import java.util.stream.Stream; /** CSS element selector, that finds elements matching a query.

Selector syntax

A selector is a chain of simple selectors, separated by combinators. Selectors are case-insensitive (including against elements, attributes, and attribute values).

The universal selector {@code *} is implicit when no element selector is supplied (i.e. {@code .header} and {@code *.header} are equivalent).

You can easily test different selectors using the Try jsoup online playground.
PatternMatchesExample
*any element*
tagelements with the given tag namediv
*|Eelements of type E in any namespace (including non-namespaced)*|name finds <dc:name> and <name> elements
ns|Eelements of type E in the namespace nsdc|name finds <dc:name> elements
ns|*all elements in the namespace nsdc|* finds <dc:p> and <dc:img>elements
#idelements with attribute ID of "id"div#wrap, #logo
.classelements with a class name of "class"div.left, .result
[attr]elements with an attribute named "attr" (with any value)a[href], [title]
[^attrPrefix]elements with an attribute name starting with "attrPrefix". Use to find elements with HTML5 datasets[^data-], div[^data-]
[attr=val]elements with an attribute named "attr", and value equal to "val"img[width=500], a[rel=nofollow]
[attr="val"]elements with an attribute named "attr", and value equal to "val"span[hello="Cleveland"][goodbye="Columbus"], a[rel="nofollow"]
[attr^=valPrefix]elements with an attribute named "attr", and value starting with "valPrefix"a[href^=http:]
[attr$=valSuffix]elements with an attribute named "attr", and value ending with "valSuffix"img[src$=.png]
[attr*=valContaining]elements with an attribute named "attr", and value containing "valContaining"a[href*=/search/]
[attr~=regex]elements with an attribute named "attr", and value matching the regular expressionimg[src~=(?i)\\.(png|jpe?g)]
[*]elements with any attributep[*] finds p elements that have at least one attribute; p:not([*]) finds those with no attributes
The above may be combined in any orderdiv.header[title]

Combinators

E Fan F element descended from an E elementdiv a, .logo h1
E {@literal >} Fan F direct child of Eol {@literal >} li
E + Fan F element immediately preceded by sibling Eli + li, div.head + div
E ~ Fan F element preceded by sibling Eh1 ~ p
E, F, Gall matching elements E, F, or Ga[href], div, h3

Pseudo selectors

:lt(n)elements whose sibling index is less than ntd:lt(3) finds the first 3 cells of each row
:gt(n)elements whose sibling index is greater than ntd:gt(1) finds cells after skipping the first two
:eq(n)elements whose sibling index is equal to ntd:eq(0) finds the first cell of each row
:has(selector)elements that contains at least one element matching the selectordiv:has(p) finds divs that contain p elements.
div:has(> a) selects div elements that have at least one direct child a element.
section:has(h1, h2) finds section elements that contain a h1 or a h2 element
:is(selector list)elements that match any of the selectors in the selector list:is(h1, h2, h3, h4, h5, h6) finds any heading element.
:is(section, article) > :is(h1, h2) finds a h1 or h2 that is a direct child of a section or an article
:not(selector)elements that do not match the selector. See also {@link Elements#not(String)}div:not(.logo) finds all divs that do not have the "logo" class.

div:not(:has(div)) finds divs that do not contain divs.

:contains(text)elements that contains the specified text. The search is case insensitive. The text may appear in the found element, or any of its descendants. The text is whitespace normalized.

To find content that includes parentheses, escape those with a {@code \}.

p:contains(jsoup) finds p elements containing the text "jsoup".

{@code p:contains(hello \(there\))} finds p elements containing the text "Hello (There)".

:containsOwn(text)elements that directly contain the specified text. The search is case insensitive. The text must appear in the found element, not any of its descendants.p:containsOwn(jsoup) finds p elements with own text "jsoup".
:containsData(data)elements that contain the specified data. The contents of {@code script} and {@code style} elements, and {@code comment} nodes (etc) are considered data nodes, not text nodes. The search is case insensitive. The data may appear in the found element, or any of its descendants.script:containsData(jsoup) finds script elements containing the data "jsoup".
:containsWholeText(text)elements that contains the specified non-normalized text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, or any of its descendants.

To find content that includes parentheses, escape those with a {@code \}.

p:containsWholeText(jsoup\nThe Java HTML Parser) finds p elements containing the text "jsoup\nThe Java HTML Parser" (and not other variations of whitespace or casing, as :contains() would). Note that {@code br} elements are presented as a newline.

:containsWholeOwnText(text)elements that directly contain the specified non-normalized text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, but not in its descendants.

To find content that includes parentheses, escape those with a {@code \}.

p:containsWholeOwnText(jsoup\nThe Java HTML Parser) finds p elements directly containing the text "jsoup\nThe Java HTML Parser" (and not other variations of whitespace or casing, as :contains() would). Note that {@code br} elements are presented as a newline.

:matches(regex)elements containing whitespace normalized text that matches the specified regular expression. The text may appear in the found element, or any of its descendants.td:matches(\\d+) finds table cells containing digits. div:matches((?i)login) finds divs containing the text, case insensitively.
:matchesWholeText(regex)elements containing non-normalized whole text that matches the specified regular expression. The text may appear in the found element, or any of its descendants.td:matchesWholeText(\\s{2,}) finds table cells containing a run of at least two space characters.
:matchesWholeOwnText(regex)elements whose own non-normalized whole text matches the specified regular expression. The text must appear in the found element, not any of its descendants.td:matchesWholeOwnText(\n\\d+) finds table cells directly containing digits following a newline.
The above may be combined in any order and with other selectors.light:contains(name):eq(0)
:matchTexttreats text nodes as elements, and so allows you to match against and select text nodes.

Note that using this selector will modify the DOM, so you may want to {@code clone} your document before using.

Deprecated. This selector is deprecated and will be removed in a future version. Migrate to ::text using the Element#selectNodes() method instead.

{@code p:matchText:first-child} with input {@code

One
Two

} will return one {@link org.jsoup.nodes.PseudoTextElement} with text "{@code One}".

Structural pseudo selectors

:rootThe element that is the root of the document. In HTML, this is the html element:root
:nth-child(an+b)

elements that have an+b-1 siblings before it in the document tree, for any positive integer or zero value of n, and has a parent element. For values of a and b greater than zero, this effectively divides the element's children into groups of a elements (the last group taking the remainder), and selecting the bth element of each group. For example, this allows the selectors to address every other row in a table, and could be used to alternate the color of paragraph text in a cycle of four. The a and b values must be integers (positive, negative, or zero). The index of the first child of an element is 1.

Additionally, :nth-child() supports odd and even as arguments. odd is the same as 2n+1, and even is the same as 2n.
tr:nth-child(2n+1) finds every odd row of a table. :nth-child(10n-1) the 9th, 19th, 29th, etc, element. li:nth-child(5) the 5th li
:nth-last-child(an+b)elements that have an+b-1 siblings after it in the document tree. Otherwise like :nth-child()tr:nth-last-child(-n+2) the last two rows of a table
:nth-of-type(an+b)pseudo-class notation represents an element that has an+b-1 siblings with the same expanded element name before it in the document tree, for any zero or positive integer value of n, and has a parent elementimg:nth-of-type(2n+1)
:nth-last-of-type(an+b)pseudo-class notation represents an element that has an+b-1 siblings with the same expanded element name after it in the document tree, for any zero or positive integer value of n, and has a parent elementimg:nth-last-of-type(2n+1)
:first-childelements that are the first child of some other element.div {@literal >} p:first-child
:last-childelements that are the last child of some other element.ol {@literal >} li:last-child
:first-of-typeelements that are the first sibling of its type in the list of children of its parent elementdl dt:first-of-type
:last-of-typeelements that are the last sibling of its type in the list of children of its parent elementtr {@literal >} td:last-of-type
:only-childelements that have a parent element and whose parent element have no other element children
:only-of-type an element that has a parent element and whose parent element has no other element children with the same expanded element name
:emptyelements that contain no child elements or nodes, with the exception of blank text nodes, comments, XML declarations, and doctype declarations. In other words, it matches elements that are effectively empty of meaningful content.li:not(:empty)

Node pseudo selectors

These selectors enable matching specific leaf nodes, including Comments, TextNodes. When used with {@link Element#select(String)}, these can be used with structural selectors such as :has() to refine which Elements are matched. To retrieve matching Nodes directly, use {@link Element#selectNodes(String)}.
::nodeMatches any node
::leafnodeMatches any leaf-node (that is, a Node which is not an Element)
::commentMatches a Comment node
::textMatches a TextNode
::dataMatches a DataNode (e.g. the content of a script or a style element)
::cdataMatches a CDataNode (which are only present in XML)
::node:contains(text)Matches a node that has a (normalized, case-insensitive) value containing text.::comment:contains(foo bar)
::node:matches(regex)Matches a node that has a value matching the regex.::comment:matches(\\d+)
::node:blankMatches a node that has either no value, or a value of only whitespace.::comment:not(:blank)

A word on using regular expressions in these selectors: depending on the content of the regex, you will need to quote the pattern using Pattern.quote("regex") for it to parse correctly through both the selector parser and the regex parser. E.g. String query = "div:matches(" + Pattern.quote(regex) + ")";.

Escaping special characters: to match a tag, ID, or other selector that does not follow the regular CSS syntax, the query must be escaped with the \ character. For example, to match by ID {@code

}, use {@code document.select("#i\\.d")}.

@see Element#select(String css)
 @see Element#selectFirst(String css)
 @see Element#select(Evaluator eval)
 @see Element#selectNodes(String css)
 @see Element#selectNodes(String css, Class nodeType)
 @see Elements#select(String css)
 @see Element#selectXpath(String xpath)
 */
public class Selector {
    // not instantiable
    private Selector() {}

    /**
     Find Elements matching the CSS query. The query is parsed via {@link #evaluatorOf(String)} on every call.

     @param query CSS selector; must not be empty
     @param root root element to descend into
     @return matching elements, empty if none
     @throws Selector.SelectorParseException (unchecked) on an invalid CSS query.
     */
    public static Elements select(String query, Element root) {
        Validate.notEmpty(query);
        return select(evaluatorOf(query), root);
    }

    /**
     Find Elements matching the Evaluator.

     @param evaluator CSS Evaluator; must not be null
     @param root root (context) element to start from; must not be null
     @return matching elements, empty if none
     */
    public static Elements select(Evaluator evaluator, Element root) {
        Validate.notNull(evaluator);
        Validate.notNull(root);
        return Collector.collect(evaluator, root);
    }

    /**
     Finds a Stream of elements matching the CSS query. The query is parsed via {@link #evaluatorOf(String)} on every
     call.

     @param query CSS selector; must not be empty
     @param root root element to descend into
     @return a Stream of matching elements, empty if none
     @throws Selector.SelectorParseException (unchecked) on an invalid CSS query.
     @since 1.19.1
     */
    public static Stream selectStream(String query, Element root) {
        Validate.notEmpty(query);
        return selectStream(evaluatorOf(query), root);
    }

    /**
     Finds a Stream of elements matching the evaluator.

     @param evaluator CSS Evaluator; must not be null
     @param root root element to descend into; must not be null
     @return a Stream of matching elements, empty if none
     @since 1.19.1
     */
    public static Stream selectStream(Evaluator evaluator, Element root) {
        Validate.notNull(evaluator);
        Validate.notNull(root);
        return Collector.stream(evaluator, root);
    }

    /**
     Find elements matching the query, across multiple roots. Elements will be deduplicated (in the case of
     overlapping hierarchies).
The query is parsed once and the resulting Evaluator is reused for every root.

     @param query CSS selector; must not be empty
     @param roots root elements to descend into; must not be null
     @return matching elements, empty if none
     */
    public static Elements select(String query, Iterable roots) {
        Validate.notEmpty(query);
        Validate.notNull(roots);
        Evaluator evaluator = evaluatorOf(query);
        Elements elements = new Elements();
        HashSet seenElements = new HashSet<>(); // dedupe elements by identity, as .equals is ==

        for (Element root : roots) {
            selectStream(evaluator, root)
                .filter(seenElements::add) // add() returns false for an element already seen, which drops it here
                .forEach(elements::add);
        }

        return elements;
    }

    // exclude set. package open so that Elements can implement .not() selector.
    // Returns a new Elements holding each member of `elements` that does not equal any member of `outs`; the order
    // of `elements` is kept, and neither input collection is modified.
    static Elements filterOut(Collection elements, Collection outs) {
        Elements output = new Elements();
        for (Element el : elements) {
            boolean found = false;
            for (Element out : outs) {
                if (el.equals(out)) {
                    found = true;
                    break;
                }
            }
            if (!found)
                output.add(el);
        }
        return output;
    }

    /**
     Find the first Element that matches the query.

     @param cssQuery CSS selector; must not be empty
     @param root root element to descend into
     @return the matching element, or null if none.
     */
    public static @Nullable Element selectFirst(String cssQuery, Element root) {
        Validate.notEmpty(cssQuery);
        return Collector.findFirst(evaluatorOf(cssQuery), root);
    }

    /**
     Find the first element matching the query, across multiple roots. Roots are searched in iteration order, and the
     search stops at the first root that yields a match.

     @param cssQuery CSS selector; must not be empty
     @param roots root elements to descend into; must not be null
     @return the first matching element, or {@code null} if none
     @since 1.19.1
     */
    public static @Nullable Element selectFirst(String cssQuery, Iterable roots) {
        Validate.notEmpty(cssQuery);
        Validate.notNull(roots);
        Evaluator evaluator = evaluatorOf(cssQuery); // parsed once, reused for each root

        for (Element root : roots) {
            Element first = Collector.findFirst(evaluator, root);
            if (first != null) return first;
        }

        return null;
    }

    /**
     Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would otherwise not
     be valid in a selector.
@see CSS Object Model, serialize an identifier @since 1.20.1 */ public static String escapeCssIdentifier(String in) { return TokenQueue.escapeCssIdentifier(in); } /** Consume a CSS identifier (ID or class) off the queue.

Note: For backwards compatibility this method supports improperly formatted CSS identifiers, e.g. {@code 1} instead of {@code \31}.

@return The unescaped identifier. @throws IllegalArgumentException if an invalid escape sequence was found. @see CSS Syntax Module Level 3, Consume an ident sequence @see CSS Syntax Module Level 3, ident-token @since 1.20.1 */ public static String unescapeCssIdentifier(String in) { try (TokenQueue tq = new TokenQueue(in)) { return tq.consumeCssIdentifier(); } } /** Parse a CSS query into an Evaluator. If you are evaluating the same query repeatedly, it may be more efficient to parse it once and reuse the Evaluator. @param css CSS query @return Evaluator @see Selector selector query syntax @throws Selector.SelectorParseException if the CSS query is invalid @since 1.21.1 */ public static Evaluator evaluatorOf(String css) { return QueryParser.parse(css); } public static class SelectorParseException extends IllegalStateException { public SelectorParseException(String msg) { super(msg); } public SelectorParseException(String msg, Object... msgArgs) { super(String.format(msg, msgArgs)); } public SelectorParseException(Throwable cause, String msg, Object... msgArgs) { super(String.format(msg, msgArgs), cause); } } } ================================================ FILE: src/main/java/org/jsoup/select/StructuralEvaluator.java ================================================ package org.jsoup.select; import org.jsoup.internal.SoftPool; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Element; import org.jsoup.nodes.LeafNode; import org.jsoup.nodes.Node; import org.jsoup.nodes.NodeIterator; import org.jsoup.nodes.TextNode; import java.util.ArrayList; import java.util.Map; import java.util.WeakHashMap; /** * Base structural evaluator. 
*/ abstract class StructuralEvaluator extends Evaluator { final Evaluator evaluator; boolean wantsNodes; // if the evaluator requested nodes, not just elements public StructuralEvaluator(Evaluator evaluator) { this.evaluator = evaluator; wantsNodes = evaluator.wantsNodes(); } @Override boolean wantsNodes() { return wantsNodes; } // Memoize inner matches, to save repeated re-evaluations of parent, sibling etc. // root + element: Boolean matches. ThreadLocal in case the Evaluator is compiled then reused across multi threads final ThreadLocal>> threadMemo = ThreadLocal.withInitial(WeakHashMap::new); boolean memoMatches(final Element root, final Node node) { Map> rootMemo = threadMemo.get(); Map memo = rootMemo.computeIfAbsent(root, r -> new WeakHashMap<>()); return memo.computeIfAbsent(node, test -> evaluator.matches(root, test)); } @Override protected void reset() { threadMemo.remove(); evaluator.reset(); super.reset(); } @Override public boolean matches(Element root, Element element) { return evaluateMatch(root, element); } @Override boolean matches(Element root, LeafNode leafNode) { return evaluateMatch(root, leafNode); } abstract boolean evaluateMatch(Element root, Node node); static class Root extends Evaluator { @Override public boolean matches(Element root, Element element) { return root == element; } @Override protected int cost() { return 1; } @Override public String toString() { return ">"; } } static class Has extends StructuralEvaluator { static final SoftPool> NodeIterPool = new SoftPool<>(() -> new NodeIterator<>(new TextNode(""), Node.class)); // the element here is just a placeholder so this can be final - gets set in restart() private final boolean checkSiblings; // evaluating against siblings (or children) public Has(Evaluator evaluator) { super(evaluator); checkSiblings = evalWantsSiblings(evaluator); } @Override public boolean matches(Element root, Element element) { if (checkSiblings) { // evaluating against siblings for (Element sib = 
element.firstElementSibling(); sib != null; sib = sib.nextElementSibling()) { if (sib != element && evaluator.matches(element, sib)) { // don't match against self return true; } } } // otherwise we only want to match children (or below), and not the input element. And we want to minimize GCs so reusing the Iterator obj NodeIterator it = NodeIterPool.borrow(); it.restart(element); try { while (it.hasNext()) { Node node = it.next(); if (node == element) continue; // don't match self, only descendants if (evaluator.matches(element, node)) { return true; } } } finally { NodeIterPool.release(it); } return false; } @Override boolean evaluateMatch(Element root, Node node) { return false; // unused; :has(::comment)) goes via implicit root combinator } /* Test if the :has sub-clause wants sibling elements (vs nested elements) - will be a Combining eval */ private static boolean evalWantsSiblings(Evaluator eval) { if (eval instanceof CombiningEvaluator) { CombiningEvaluator ce = (CombiningEvaluator) eval; for (Evaluator innerEval : ce.evaluators) { if (innerEval instanceof PreviousSibling || innerEval instanceof ImmediatePreviousSibling) return true; } } return false; } @Override protected int cost() { return 10 * evaluator.cost(); } @Override public String toString() { return String.format(":has(%s)", evaluator); } } /** Implements the :is(sub-query) pseudo-selector */ static class Is extends StructuralEvaluator { public Is(Evaluator evaluator) { super(evaluator); } @Override boolean evaluateMatch(Element root, Node node) { return evaluator.matches(root, node); } @Override protected int cost() { return 2 + evaluator.cost(); } @Override public String toString() { return String.format(":is(%s)", evaluator); } } static class Not extends StructuralEvaluator { public Not(Evaluator evaluator) { super(evaluator); } @Override boolean evaluateMatch(Element root, Node node) { return !memoMatches(root, node); } @Override protected int cost() { return 2 + evaluator.cost(); } @Override 
public String toString() { return String.format(":not(%s)", evaluator); } } /** Any Ancestor (i.e., ascending parent chain.). */ static class Ancestor extends StructuralEvaluator { public Ancestor(Evaluator evaluator) { super(evaluator); } @Override boolean evaluateMatch(Element root, Node node) { if (root == node) return false; for (Node parent = node.parent(); parent != null; parent = parent.parent()) { if (memoMatches(root, parent)) return true; if (parent == root) break; } return false; } @Override protected int cost() { return 8 * evaluator.cost(); // probably lower than has(), but still significant, depending on doc and el depth. } @Override public String toString() { return String.format("%s ", evaluator); } } /** Holds a list of evaluators for one > two > three immediate parent matches, and the final direct evaluator under test. To match, these are effectively ANDed together, starting from the last, matching up to the first. */ static class ImmediateParentRun extends StructuralEvaluator { final ArrayList evaluators = new ArrayList<>(); int cost = 2; public ImmediateParentRun(Evaluator evaluator) { super(evaluator); evaluators.add(evaluator); cost += evaluator.cost(); } void add(Evaluator evaluator) { evaluators.add(evaluator); cost += evaluator.cost(); wantsNodes |= evaluator.wantsNodes(); } @Override boolean evaluateMatch(Element root, Node node) { if (node == root) return false; // cannot match as the second eval (first parent test) would be above the root for (int i = evaluators.size() -1; i >= 0; --i) { if (node == null) return false; Evaluator eval = evaluators.get(i); if (!eval.matches(root, node)) return false; node = node.parent(); } return true; } @Override protected int cost() { return cost; } @Override protected void reset() { for (Evaluator evaluator : evaluators) { evaluator.reset(); } super.reset(); } @Override public String toString() { return StringUtil.join(evaluators, " > "); } } static class PreviousSibling extends StructuralEvaluator { 
public PreviousSibling(Evaluator evaluator) { super(evaluator); } // matches any previous sibling, so can be same in Element only or wantsNodes context @Override boolean evaluateMatch(Element root, Node node) { if (root == node) return false; for (Node sib = node.firstSibling(); sib != null; sib = sib.nextSibling()) { if (sib == node) break; if (memoMatches(root, sib)) return true; } return false; } @Override protected int cost() { return 3 * evaluator.cost(); } @Override public String toString() { return String.format("%s ~ ", evaluator); } } static class ImmediatePreviousSibling extends StructuralEvaluator { public ImmediatePreviousSibling(Evaluator evaluator) { super(evaluator); } @Override boolean evaluateMatch(Element root, Node node) { if (root == node) return false; Node prev = wantsNodes ? node.previousSibling() : node.previousElementSibling(); return prev != null && memoMatches(root, prev); } @Override protected int cost() { return 2 + evaluator.cost(); } @Override public String toString() { return String.format("%s + ", evaluator); } } } ================================================ FILE: src/main/java/org/jsoup/select/package-info.java ================================================ /** Packages to support the CSS-style element selector. 
{@link org.jsoup.select.Selector Selector defines the query syntax.} */ @NullMarked package org.jsoup.select; import org.jspecify.annotations.NullMarked; ================================================ FILE: src/main/java11/module-info.java ================================================ module org.jsoup { exports org.jsoup; exports org.jsoup.helper; exports org.jsoup.nodes; exports org.jsoup.parser; exports org.jsoup.safety; exports org.jsoup.select; requires transitive java.xml; // for org.w3c.dom out of W3CDom requires static org.jspecify; // nullability annotations requires static java.net.http; // HttpClient on Java 11; guarded } ================================================ FILE: src/main/java11/org/jsoup/helper/HttpClientExecutor.java ================================================ package org.jsoup.helper; import org.jsoup.Connection; import org.jspecify.annotations.Nullable; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.Proxy; import java.net.ProxySelector; import java.net.SocketAddress; import java.net.URI; import java.net.URISyntaxException; import java.net.http.HttpClient; import java.net.http.HttpHeaders; import java.net.http.HttpRequest; import java.net.http.HttpResponse; import java.time.Duration; import java.util.ArrayList; import java.util.Collections; import java.util.List; import static org.jsoup.helper.HttpConnection.Response; import static org.jsoup.helper.HttpConnection.Response.writePost; /** Executes requests using the HttpClient, for http/2 support. Enabled by default when available. To disable, set property {@code jsoup.useHttpClient} to {@code false}. */ class HttpClientExecutor extends RequestExecutor { // HttpClient expects proxy settings per client; we do per request, so held as a thread local. Can't do same for // auth because that callback is on a worker thread, so can only do auth per Connection. 
So we create a new client // if the authenticator is different between requests static ThreadLocal<@Nullable Proxy> perRequestProxy = new ThreadLocal<>(); @Nullable HttpResponse hRes; public HttpClientExecutor(HttpConnection.Request request, HttpConnection.@Nullable Response previousResponse) { super(request, previousResponse); } /** Retrieve the HttpClient from the Connection, or create a new one. Allows for connection pooling of requests in the same Connection (session). */ HttpClient client() { // we try to reuse the same Client across requests in a given Connection; but if the request's auth or ssl context have changed, we need to create a new client if (req.connection.client != null) { HttpClient client = (HttpClient) req.connection.client; boolean reuse = true; RequestAuthenticator prevAuth = req.connection.lastAuth; req.connection.lastAuth = req.authenticator; if (prevAuth != req.authenticator) // might both be null reuse = false; if (req.sslContext != null && !(client.sslContext() == req.sslContext)) // client returns default context if not otherwise set reuse = false; if (reuse) return client; } HttpClient.Builder builder = HttpClient.newBuilder(); builder.followRedirects(HttpClient.Redirect.NEVER); // customized redirects builder.proxy(new ProxyWrap()); // thread local impl for per request; called on executing thread if (req.authenticator != null) builder.authenticator(new AuthenticationHandler(req.authenticator)); if (req.sslContext != null) builder.sslContext(req.sslContext); HttpClient client = builder.build(); req.connection.client = client; return client; } @Override HttpConnection.Response execute() throws IOException { try { HttpRequest.Builder reqBuilder = HttpRequest.newBuilder(req.url.toURI()).method(req.method.name(), requestBody(req)); if (req.timeout() > 0) reqBuilder.timeout( Duration.ofMillis(req.timeout())); // infinite if unset (UrlConnection / jsoup uses 0 for same) CookieUtil.applyCookiesToRequest(req, reqBuilder::header); // headers: 
req.multiHeaders().forEach((key, values) -> { values.forEach(value -> reqBuilder.header(key, value)); }); if (req.proxy() != null) perRequestProxy.set(req.proxy()); // set up per request proxy HttpRequest hReq = reqBuilder.build(); HttpClient client = client(); hRes = client.send(hReq, HttpResponse.BodyHandlers.ofInputStream()); HttpHeaders headers = hRes.headers(); // set up the response Response res = new Response(req); res.executor = this; res.method = Connection.Method.valueOf(hRes.request().method()); res.url = hRes.uri().toURL(); res.statusCode = hRes.statusCode(); res.statusMessage = StatusMessage(res.statusCode); res.contentType = headers.firstValue("content-type").orElse(""); long length = headers.firstValueAsLong("content-length").orElse(-1); res.contentLength = length < Integer.MAX_VALUE ? (int) length : -1; res.prepareResponse(headers.map(), prevRes); return res; } catch (IOException e) { safeClose(); throw e; } catch (InterruptedException e) { safeClose(); Thread.currentThread().interrupt(); throw new IOException(e); } catch (URISyntaxException e) { throw new IllegalArgumentException("Malformed URL: " + req.url, e); } finally { // detach per request proxy perRequestProxy.remove(); } } /** As HTTP/2 no longer provides a server-set status message, and HttpClient doesn't parse it for 1.1, just provide minimal stock ones, for loggers. 
*/ static String StatusMessage(int statusCode) { if (statusCode < 400) return "OK"; if (statusCode == 404) return "Not Found"; return "Error " + statusCode; } @Override InputStream responseBody() throws IOException { if (hRes == null) throw new IllegalStateException("Not yet executed"); return hRes.body(); } @Override void safeClose() { if (hRes != null) { InputStream body = hRes.body(); if (body != null) { try { body.close(); } catch (IOException ignored) {} } hRes = null; } } static HttpRequest.BodyPublisher requestBody(final HttpConnection.Request req) throws IOException { if (req.method.hasBody()) { ByteArrayOutputStream buf = new ByteArrayOutputStream(); writePost(req, buf); return HttpRequest.BodyPublishers.ofByteArray(buf.toByteArray()); } else { return HttpRequest.BodyPublishers.noBody(); } } static class ProxyWrap extends ProxySelector { // empty list for no proxy: static final List NoProxy = new ArrayList<>(0); @Override public List select(URI uri) { Proxy proxy = perRequestProxy.get(); if (proxy != null) { return Collections.singletonList(proxy); } ProxySelector defaultSelector = ProxySelector.getDefault(); if (defaultSelector != null && defaultSelector != this) { // avoid recursion if we were set as default return defaultSelector.select(uri); } return NoProxy; } @Override public void connectFailed(URI uri, SocketAddress sa, IOException ioe) { if (perRequestProxy.get() != null) { return; // no-op } ProxySelector defaultSelector = ProxySelector.getDefault(); if (defaultSelector != null && defaultSelector != this) { defaultSelector.connectFailed(uri, sa, ioe); } } } } ================================================ FILE: src/main/java11/org/jsoup/helper/RequestAuthHandler.java ================================================ package org.jsoup.helper; import java.net.HttpURLConnection; import java.net.http.HttpClient; /** A per-request authentication shim, used in Java 9+. 
*/
class RequestAuthHandler implements AuthenticationHandler.AuthShim {
    public RequestAuthHandler() {}

    /**
     Installs a handler wrapping {@code auth} on the supplied target, which is either an HttpURLConnection or an
     HttpClient.Builder.

     @throws IllegalArgumentException if the target is neither of those types
     */
    @Override
    public void enable(RequestAuthenticator auth, Object connOrHttp) {
        final AuthenticationHandler handler = new AuthenticationHandler(auth);

        // Dispatching on the runtime type is not pretty, but it lets one shim serve both the urlconnection and the
        // httpclient executors without adding further multi-version shims.
        if (connOrHttp instanceof HttpURLConnection) {
            ((HttpURLConnection) connOrHttp).setAuthenticator(handler);
            return;
        }
        if (connOrHttp instanceof HttpClient.Builder) {
            ((HttpClient.Builder) connOrHttp).authenticator(handler);
            return;
        }
        throw new IllegalArgumentException("Unsupported executor: " + connOrHttp.getClass().getName());
    }

    @Override
    public void remove() {
        // nothing to do here; the Global Handler variant would clear its thread-local
    }

    @Override
    public AuthenticationHandler get(AuthenticationHandler helper) {
        // handed straight back; the Global Handler variant would look up its thread-local instead
        return helper;
    }
}

================================================
FILE: src/main/javadoc/overview.html
================================================
jsoup Javadoc overview

jsoup: Java HTML parser that makes sense of real-world HTML soup.

jsoup is a Java library for working with real-world HTML. It provides a very convenient API for fetching URLs and extracting and manipulating data, using the best of HTML5 DOM methods and CSS selectors.

jsoup implements the WHATWG HTML specification, and parses HTML to the same DOM as modern browsers do.

  • parse HTML from a URL, file, or string
  • find and extract data, using DOM traversal or CSS selectors
  • manipulate the HTML elements, attributes, and text
  • clean user-submitted content against a safelist, to prevent XSS
  • output tidy HTML

jsoup is designed to deal with all varieties of HTML found in the wild, from pristine and validating to invalid tag-soup; in every case, jsoup will create a sensible parse tree.

See jsoup.org for downloads, documentation, and examples.

@author Jonathan Hedley

================================================
FILE: src/main/resources/META-INF/proguard/org.jsoup_jsoup.pro
================================================
-dontwarn com.google.re2j.**

================================================
FILE: src/test/java/org/jsoup/JsoupTest.java
================================================
package org.jsoup;

import org.jsoup.integration.ParseTest;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.nio.file.Path;

import static org.junit.jupiter.api.Assertions.assertEquals;

public class JsoupTest {
    // Tests for the Jsoup class. Mostly for code coverage for methods that haven't been covered elsewhere already.

    @Test
    void parseWithPath() throws IOException {
        // parse(Path path, @Nullable String charsetName, String baseUri)
        Path path = ParseTest.getPath("/htmltests/medium.html");
        Document doc = Jsoup.parse(path, "UTF-8", "https://example.com/");
        String title = "Medium HTML";
        assertEquals(title, doc.title());

        // parse(Path path)
        doc = Jsoup.parse(path);
        assertEquals(title, doc.title());

        // (Path path, @Nullable String charsetName, String baseUri, Parser parser)
        doc = Jsoup.parse(path, "UTF-8", "https://example.com/", Parser.htmlParser());
        assertEquals(title, doc.title());
    }
}

================================================
FILE: src/test/java/org/jsoup/MultiLocaleExtension.java
================================================
package org.jsoup;

import org.junit.jupiter.api.extension.AfterEachCallback;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.api.extension.ExtensionContext;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.ArgumentsProvider;
import org.junit.jupiter.params.provider.ArgumentsSource;

import java.lang.annotation.*;
import java.util.Locale;
import java.util.stream.Stream;

/**
 JUnit extension that supplies each annotated test with a Locale argument (English, then Turkish) and puts the JVM
 default locale back after every test.
 */
public class MultiLocaleExtension implements AfterEachCallback, ArgumentsProvider {
    // captured when the extension is instantiated, so afterEach can restore it
    private final Locale defaultLocale = Locale.getDefault();

    @Override
    public void afterEach(ExtensionContext context) {
        Locale.setDefault(defaultLocale);
    }

    @Override
    public Stream<? extends Arguments> provideArguments(ExtensionContext extensionContext) {
        return Stream.of(Arguments.of(Locale.ENGLISH), Arguments.arguments(new Locale("tr")));
    }

    /** Marks a parameterized test that is run once per locale provided by {@link MultiLocaleExtension}. */
    @Documented
    @Target(ElementType.METHOD)
    @Retention(RetentionPolicy.RUNTIME)
    @ArgumentsSource(MultiLocaleExtension.class)
    @ExtendWith(MultiLocaleExtension.class)
    @ParameterizedTest
    public @interface MultiLocaleTest {
    }
}

================================================
FILE: src/test/java/org/jsoup/SerializationExceptionTest.java
================================================
package org.jsoup;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.*;

public class SerializationExceptionTest {
    @Test
    void constructors() {
        SerializationException e = new SerializationException("message");
        assertEquals("message", e.getMessage());
        assertNull(e.getCause());

        SerializationException e2 = new SerializationException("message", new Exception("cause"));
        assertEquals("message", e2.getMessage());
        assertEquals("cause", e2.getCause().getMessage());

        SerializationException e3 = new SerializationException();
        assertNull(e3.getMessage());
        assertNull(e3.getCause());
    }
}

================================================
FILE: src/test/java/org/jsoup/TextUtil.java
================================================
package org.jsoup;

import java.util.regex.Pattern;

/**
 Text utils to ease testing

 @author Jonathan Hedley, jonathan@hedley.net */
public class TextUtil {
    static Pattern stripper = Pattern.compile("\\r?\\n\\s*");
    static Pattern stripLines = Pattern.compile("\\r?\\n?");
    static Pattern spaceCollapse = Pattern.compile("\\s{2,}");
    static Pattern tagSpaceCollapse = Pattern.compile(">\\s+<");
    static Pattern stripCRs = Pattern.compile("\\r*");

    /** Removes each newline together with the whitespace that follows it. */
    public static String stripNewlines(String text) {
        return stripper.matcher(text).replaceAll("");
    }

    /** Strips line breaks, collapses runs of two or more whitespace characters to one space, and removes whitespace between tags. */
    public static String normalizeSpaces(String text) {
        text = stripLines.matcher(text).replaceAll("");
        text = stripper.matcher(text).replaceAll("");
        text = spaceCollapse.matcher(text).replaceAll(" ");
        text = tagSpaceCollapse.matcher(text).replaceAll("><");
        return text;
    }

    /** Removes all carriage returns. */
    public static String stripCRs(String text) {
        return stripCRs.matcher(text).replaceAll("");
    }
}

================================================
FILE: src/test/java/org/jsoup/helper/AuthenticationHandlerTest.java
================================================
package org.jsoup.helper;

public class AuthenticationHandlerTest {
    public static final int MaxAttempts = AuthenticationHandler.MaxAttempts;

    // tests are in ConnectionTest, ProxyTest. This class just makes the MaxAttempts visible for test.
}

================================================
FILE: src/test/java/org/jsoup/helper/CookieUtilTest.java
================================================
package org.jsoup.helper;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.*;

class CookieUtilTest {

    @Test void parseCookie() {
        HttpConnection.Response res = new HttpConnection.Response();

        CookieUtil.parseCookie("foo=bar qux; Domain=.example.com; Path=/; Secure", res);
        CookieUtil.parseCookie("bar=foo qux", res);
        CookieUtil.parseCookie("=bar; Domain=.example.com; Path=/; Secure", res);
        CookieUtil.parseCookie("; Domain=.example.com; Path=/", res);
        CookieUtil.parseCookie("", res);
        CookieUtil.parseCookie(null, res);

        assertEquals(3, res.cookies().size());
        assertEquals("bar qux", res.cookies.get("foo"));
        assertEquals("foo qux", res.cookies.get("bar"));
        assertEquals(".example.com", res.cookies.get("; Domain")); // no actual cookie name or val
    }
}

================================================
FILE: src/test/java/org/jsoup/helper/DataUtilTest.java
================================================
package org.jsoup.helper;

import org.jsoup.Jsoup;
import
org.jsoup.integration.ParseTest; import org.jsoup.internal.ControllableInputStream; import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; import org.jsoup.parser.StreamParser; import org.junit.jupiter.api.Test; import java.io.*; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import static org.jsoup.integration.ParseTest.getFile; import static org.jsoup.integration.ParseTest.getPath; import static org.junit.jupiter.api.Assertions.*; public class DataUtilTest { @Test public void testCharset() { assertEquals("utf-8", DataUtil.getCharsetFromContentType("text/html;charset=utf-8 ")); assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset=UTF-8")); assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1")); assertNull(DataUtil.getCharsetFromContentType("text/html")); assertNull(DataUtil.getCharsetFromContentType(null)); assertNull(DataUtil.getCharsetFromContentType("text/html;charset=Unknown")); } @Test public void testQuotedCharset() { assertEquals("utf-8", DataUtil.getCharsetFromContentType("text/html; charset=\"utf-8\"")); assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html;charset=\"UTF-8\"")); assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=\"ISO-8859-1\"")); assertNull(DataUtil.getCharsetFromContentType("text/html; charset=\"Unsupported\"")); assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset='UTF-8'")); } private ControllableInputStream stream(String data) { return ControllableInputStream.wrap(new ByteArrayInputStream(data.getBytes(StandardCharsets.UTF_8)), 0); } private ControllableInputStream stream(String data, String charset) { return ControllableInputStream.wrap(new ByteArrayInputStream(data.getBytes(Charset.forName(charset))), 0); } @Test public void discardsSpuriousByteOrderMark() throws IOException { 
String html = "\uFEFFOneTwo"; Document doc = DataUtil.parseInputStream(stream(html), "UTF-8", "http://foo.com/", Parser.htmlParser()); assertEquals("One", doc.head().text()); } @Test public void discardsSpuriousByteOrderMarkWhenNoCharsetSet() throws IOException { String html = "\uFEFFOneTwo"; Document doc = DataUtil.parseInputStream(stream(html), null, "http://foo.com/", Parser.htmlParser()); assertEquals("One", doc.head().text()); assertEquals("UTF-8", doc.outputSettings().charset().displayName()); } @Test public void shouldNotThrowExceptionOnEmptyCharset() { assertNull(DataUtil.getCharsetFromContentType("text/html; charset=")); assertNull(DataUtil.getCharsetFromContentType("text/html; charset=;")); } @Test public void shouldSelectFirstCharsetOnWeirdMultileCharsetsInMetaTags() { assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1, charset=1251")); } @Test public void shouldCorrectCharsetForDuplicateCharsetString() { assertEquals("iso-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=charset=iso-8859-1")); } @Test public void shouldReturnNullForIllegalCharsetNames() { assertNull(DataUtil.getCharsetFromContentType("text/html; charset=$HJKDF§$/(")); } @Test public void generatesMimeBoundaries() { String m1 = DataUtil.mimeBoundary(); String m2 = DataUtil.mimeBoundary(); assertEquals(DataUtil.boundaryLength, m1.length()); assertEquals(DataUtil.boundaryLength, m2.length()); assertNotSame(m1, m2); } @Test public void wrongMetaCharsetFallback() throws IOException { String html = ""; Document doc = DataUtil.parseInputStream(stream(html), null, "http://example.com", Parser.htmlParser()); final String expected = "\n" + " \n" + " \n" + " \n" + " \n" + ""; assertEquals(expected, doc.toString()); } @Test public void secondMetaElementWithContentTypeContainsCharsetParameter() throws Exception { String html = "" + "" + "" + "한국어"; Document doc = DataUtil.parseInputStream(stream(html, "euc-kr"), null, "http://example.com", 
Parser.htmlParser()); assertEquals("한국어", doc.body().text()); } @Test public void firstMetaElementWithCharsetShouldBeUsedForDecoding() throws Exception { String html = "" + "" + "" + "Übergrößenträger"; Document doc = DataUtil.parseInputStream(stream(html, "iso-8859-1"), null, "http://example.com", Parser.htmlParser()); assertEquals("Übergrößenträger", doc.body().text()); } @Test public void parseSequenceInputStream() throws IOException { // https://github.com/jhy/jsoup/pull/1671 File in = getFile("/htmltests/medium.html"); String fileContent = new String(Files.readAllBytes(in.toPath())); int halfLength = fileContent.length() / 2; String firstPart = fileContent.substring(0, halfLength); String secondPart = fileContent.substring(halfLength); SequenceInputStream sequenceStream = new SequenceInputStream( stream(firstPart), stream(secondPart) ); ControllableInputStream stream = ControllableInputStream.wrap(sequenceStream, 0); Document doc = DataUtil.parseInputStream(stream, null, "", Parser.htmlParser()); assertEquals(fileContent, doc.outerHtml()); } @Test public void supportsBOMinFiles() throws IOException { // test files from http://www.i18nl10n.com/korean/utftest/ File in = getFile("/bomtests/bom_utf16be.html"); Document doc = Jsoup.parse(in, null, "http://example.com"); assertTrue(doc.title().contains("UTF-16BE")); assertTrue(doc.text().contains("가각갂갃간갅")); in = getFile("/bomtests/bom_utf16le.html"); doc = Jsoup.parse(in, null, "http://example.com"); assertTrue(doc.title().contains("UTF-16LE")); assertTrue(doc.text().contains("가각갂갃간갅")); in = getFile("/bomtests/bom_utf32be.html"); doc = Jsoup.parse(in, null, "http://example.com"); assertTrue(doc.title().contains("UTF-32BE")); assertTrue(doc.text().contains("가각갂갃간갅")); in = getFile("/bomtests/bom_utf32le.html"); doc = Jsoup.parse(in, null, "http://example.com"); assertTrue(doc.title().contains("UTF-32LE")); assertTrue(doc.text().contains("가각갂갃간갅")); } @Test public void streamerSupportsBOMinFiles() throws IOException 
{ // test files from http://www.i18nl10n.com/korean/utftest/ Path in = getFile("/bomtests/bom_utf16be.html").toPath(); Parser parser = Parser.htmlParser(); Document doc = DataUtil.streamParser(in, null, "http://example.com", parser).complete(); assertTrue(doc.title().contains("UTF-16BE")); assertTrue(doc.text().contains("가각갂갃간갅")); in = getFile("/bomtests/bom_utf16le.html").toPath(); doc = DataUtil.streamParser(in, null, "http://example.com", parser).complete(); assertTrue(doc.title().contains("UTF-16LE")); assertTrue(doc.text().contains("가각갂갃간갅")); in = getFile("/bomtests/bom_utf32be.html").toPath(); doc = DataUtil.streamParser(in, null, "http://example.com", parser).complete(); assertTrue(doc.title().contains("UTF-32BE")); assertTrue(doc.text().contains("가각갂갃간갅")); in = getFile("/bomtests/bom_utf32le.html").toPath(); doc = DataUtil.streamParser(in, null, "http://example.com", parser).complete(); assertTrue(doc.title().contains("UTF-32LE")); assertTrue(doc.text().contains("가각갂갃간갅")); } @Test public void supportsUTF8BOM() throws IOException { File in = getFile("/bomtests/bom_utf8.html"); Document doc = Jsoup.parse(in, null, "http://example.com"); assertEquals("OK", doc.head().select("title").text()); } @Test public void noExtraNULLBytes() throws IOException { final byte[] b = "
üü
".getBytes(StandardCharsets.UTF_8); Document doc = Jsoup.parse(new ByteArrayInputStream(b), null, ""); assertFalse( doc.outerHtml().contains("\u0000") ); } @Test public void supportsZippedUTF8BOM() throws IOException { File in = getFile("/bomtests/bom_utf8.html.gz"); Document doc = Jsoup.parse(in, null, "http://example.com"); assertEquals("OK", doc.head().select("title").text()); assertEquals("There is a UTF8 BOM at the top (before the XML decl). If not read correctly, will look like a non-joining space.", doc.body().text()); } @Test public void streamerSupportsZippedUTF8BOM() throws IOException { Path in = getFile("/bomtests/bom_utf8.html.gz").toPath(); Document doc = DataUtil.streamParser(in, null, "http://example.com", Parser.htmlParser()).complete(); assertEquals("OK", doc.head().select("title").text()); assertEquals("There is a UTF8 BOM at the top (before the XML decl). If not read correctly, will look like a non-joining space.", doc.body().text()); } @Test public void supportsXmlCharsetDeclaration() throws IOException { String encoding = "iso-8859-1"; InputStream soup = new ByteArrayInputStream(( "" + "" + "Hellö Wörld!" 
).getBytes(Charset.forName(encoding))); Document doc = Jsoup.parse(soup, null, ""); assertEquals("Hellö Wörld!", doc.body().text()); } @Test public void loadsGzipFile() throws IOException { File in = getFile("/htmltests/gzip.html.gz"); Document doc = Jsoup.parse(in, null); assertEquals("Gzip test", doc.title()); assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text()); } @Test public void loadsGzipPath() throws IOException { Path in = getPath("/htmltests/gzip.html.gz"); Document doc = Jsoup.parse(in, null); assertEquals("Gzip test", doc.title()); assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text()); } @Test public void loadsZGzipFile() throws IOException { // compressed on win, with z suffix File in = getFile("/htmltests/gzip.html.z"); Document doc = Jsoup.parse(in, null); assertEquals("Gzip test", doc.title()); assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text()); } @Test public void loadsZGzipPath() throws IOException { // compressed on win, with z suffix Path in = getPath("/htmltests/gzip.html.z"); Document doc = Jsoup.parse(in, null); assertEquals("Gzip test", doc.title()); assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text()); } @Test public void handlesFakeGzipFile() throws IOException { File in = getFile("/htmltests/fake-gzip.html.gz"); Document doc = Jsoup.parse(in, null); assertEquals("This is not gzipped", doc.title()); assertEquals("And should still be readable.", doc.selectFirst("p").text()); } @Test public void handlesFakeGzipPath() throws IOException { Path in = getPath("/htmltests/fake-gzip.html.gz"); Document doc = Jsoup.parse(in, null); assertEquals("This is not gzipped", doc.title()); assertEquals("And should still be readable.", doc.selectFirst("p").text()); } // an input stream to give a range of output sizes, that changes on each read static class VaryingReadInputStream extends InputStream { final InputStream in; int stride = 0; VaryingReadInputStream(InputStream in) 
{ this.in = in; } public int read() throws IOException { return in.read(); } public int read(byte[] b) throws IOException { return in.read(b, 0, Math.min(b.length, ++stride)); } public int read(byte[] b, int off, int len) throws IOException { return in.read(b, off, Math.min(len, ++stride)); } } @Test void handlesChunkedInputStream() throws IOException { File inputFile = ParseTest.getFile("/htmltests/large.html"); String input = ParseTest.getFileAsString(inputFile); VaryingReadInputStream stream = new VaryingReadInputStream(ParseTest.inputStreamFrom(input)); Document expected = Jsoup.parse(input, "https://example.com"); Document doc = Jsoup.parse(stream, null, "https://example.com"); assertTrue(doc.hasSameValue(expected)); } @Test void handlesUnlimitedRead() throws IOException { File inputFile = ParseTest.getFile("/htmltests/large.html"); String input = ParseTest.getFileAsString(inputFile); VaryingReadInputStream stream = new VaryingReadInputStream(ParseTest.inputStreamFrom(input)); ByteBuffer byteBuffer = DataUtil.readToByteBuffer(stream, 0); String read = new String(byteBuffer.array(), 0, byteBuffer.limit(), StandardCharsets.UTF_8); assertEquals(input, read); } @Test void controllableInputStreamAllowsNull() throws IOException { ControllableInputStream is = ControllableInputStream.wrap(null, 0); assertNotNull(is); assertTrue(is.baseReadFully()); is.close(); } @Test void streamParserSurrogateAcrossBuffer() throws IOException { // https://github.com/jhy/jsoup/issues/2353 try (StreamParser parser = DataUtil.streamParser(ParseTest.getPath("/fuzztests/2353.html.gz"), DataUtil.UTF_8, "", Parser.htmlParser())) { Document doc = parser.complete(); String html = doc.html(); assertTrue(html.contains("Read-Fully!")); } } @Test void parseSurrogateAcrossBuffer() throws IOException { Document doc = Jsoup.parse(ParseTest.getPath("/fuzztests/2353.html.gz")); assertTrue(doc.html().contains("Read-Fully!")); } @Test void charsetSniffingCanReuseTruncatedPreParse() throws IOException { 
// #2448: when available() reports buffered bytes after the first read, the sniffed pre-parse may be reused while capped, leading to truncation StringBuilder sb = new StringBuilder(); sb.append("t
");
        while (sb.length() < 6200) {
            sb.append("0123456789 abcdefghijklmnopqrstuvwxyz\n");
        }
        sb.append("
list

"); String html = sb.toString(); byte[] bytes = html.getBytes(StandardCharsets.UTF_8); ControllableInputStream in = ControllableInputStream.wrap(new BufferedOnceAvailableStream(bytes), 0); DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(in, null, "http://example.com/", Parser.htmlParser()); Document doc = DataUtil.parseInputStream(charsetDoc, "http://example.com/", Parser.htmlParser()); assertNotNull(doc.selectFirst("hr"), "hr should survive the sniff + full parse"); } // delivers all bytes in the first read, then signals available()>0 once to trigger a second read and baseReadFully=true static final class BufferedOnceAvailableStream extends InputStream { private final byte[] data; private int pos = 0; private boolean extraSignal = true; BufferedOnceAvailableStream(byte[] data) { this.data = data; } @Override public int read(byte[] b, int off, int len) { if (pos >= data.length) return -1; int take = Math.min(len, data.length - pos); System.arraycopy(data, pos, b, off, take); pos += take; return take; } @Override public int read() { return pos < data.length ? (data[pos++] & 0xff) : -1; } @Override public int available() { if (pos < data.length) return data.length - pos; if (extraSignal) { extraSignal = false; return 1; // nudge SimpleBufferedInput.fill() to try another read } return 0; } } @Test void charsetSniffingIgnoresAdvisoryAvailableIOException() throws IOException { // https://github.com/jhy/jsoup/issues/2474 // JDK 8's HttpURLConnection stream may throw from available() once the peer has closed the socket; // that advisory failure does not mean we can't still consume bytes already buffered or read to clean EOF. 
String html = "OneTwo"; byte[] bytes = html.getBytes(StandardCharsets.UTF_8); InputStream stream = new FilterInputStream(new ByteArrayInputStream(bytes)) { @Override public int available() throws IOException { throw new IOException("Stream closed."); } }; ControllableInputStream in = ControllableInputStream.wrap(stream, 0); Document doc = DataUtil.parseInputStream(in, null, "http://example.com/", Parser.htmlParser()); assertEquals("One", doc.title()); assertEquals("Two", doc.body().text()); } } ================================================ FILE: src/test/java/org/jsoup/helper/HttpConnectionTest.java ================================================ package org.jsoup.helper; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.MultiLocaleExtension.MultiLocaleTest; import org.jsoup.integration.ParseTest; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; import java.io.IOException; import java.net.Authenticator; import java.net.MalformedURLException; import java.net.PasswordAuthentication; import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; import static org.jsoup.helper.HttpConnection.Response.fixHeaderEncoding; import static org.junit.jupiter.api.Assertions.*; public class HttpConnectionTest { /* most actual network http connection tests are in integration */ @Test public void canCreateEmptyConnection() { HttpConnection con = new HttpConnection(); assertEquals(Connection.Method.GET, con.request().method()); assertThrows(IllegalArgumentException.class, () -> { URL url = con.request().url(); }); } @Test public void throwsExceptionOnResponseWithoutExecute() { assertThrows(IllegalArgumentException.class, () -> { Connection con = 
HttpConnection.connect("http://example.com"); con.response(); }); } @Test public void throwsExceptionOnParseWithoutExecute() { assertThrows(IllegalArgumentException.class, () -> { Connection con = HttpConnection.connect("http://example.com"); con.response().parse(); }); } @Test public void throwsExceptionOnBodyWithoutExecute() { assertThrows(IllegalArgumentException.class, () -> { Connection con = HttpConnection.connect("http://example.com"); con.response().body(); }); } @Test public void throwsExceptionOnBodyAsBytesWithoutExecute() { assertThrows(IllegalArgumentException.class, () -> { Connection con = HttpConnection.connect("http://example.com"); con.response().bodyAsBytes(); }); } @MultiLocaleTest public void caseInsensitiveHeaders(Locale locale) { Locale.setDefault(locale); Connection.Response res = new HttpConnection.Response(); res.header("Accept-Encoding", "gzip"); res.header("content-type", "text/html"); res.header("refErrer", "http://example.com"); assertTrue(res.hasHeader("Accept-Encoding")); assertTrue(res.hasHeader("accept-encoding")); assertTrue(res.hasHeader("accept-Encoding")); assertTrue(res.hasHeader("ACCEPT-ENCODING")); assertEquals("gzip", res.header("accept-Encoding")); assertEquals("gzip", res.header("ACCEPT-ENCODING")); assertEquals("text/html", res.header("Content-Type")); assertEquals("http://example.com", res.header("Referrer")); res.removeHeader("Content-Type"); assertFalse(res.hasHeader("content-type")); res.removeHeader("ACCEPT-ENCODING"); assertFalse(res.hasHeader("Accept-Encoding")); res.header("ACCEPT-ENCODING", "deflate"); assertEquals("deflate", res.header("Accept-Encoding")); assertEquals("deflate", res.header("accept-Encoding")); } @Test public void headers() { Connection con = HttpConnection.connect("http://example.com"); Map headers = new HashMap<>(); headers.put("content-type", "text/html"); headers.put("Connection", "keep-alive"); headers.put("Host", "http://example.com"); con.headers(headers); assertEquals("text/html", 
con.request().header("content-type")); assertEquals("keep-alive", con.request().header("Connection")); assertEquals("http://example.com", con.request().header("Host")); } @Test public void sameHeadersCombineWithComma() { Map> headers = new HashMap<>(); List values = new ArrayList<>(); values.add("no-cache"); values.add("no-store"); headers.put("Cache-Control", values); HttpConnection.Response res = new HttpConnection.Response(); res.processResponseHeaders(headers); assertEquals("no-cache, no-store", res.header("Cache-Control")); } @Test public void multipleHeaders() { Connection.Request req = new HttpConnection.Request(); req.addHeader("Accept", "Something"); req.addHeader("Accept", "Everything"); req.addHeader("Foo", "Bar"); assertTrue(req.hasHeader("Accept")); assertTrue(req.hasHeader("ACCEpt")); assertEquals("Something, Everything", req.header("accept")); assertTrue(req.hasHeader("fOO")); assertEquals("Bar", req.header("foo")); List accept = req.headers("accept"); assertEquals(2, accept.size()); assertEquals("Something", accept.get(0)); assertEquals("Everything", accept.get(1)); Map> headers = req.multiHeaders(); assertEquals(accept, headers.get("Accept")); assertEquals("Bar", headers.get("Foo").get(0)); assertTrue(req.hasHeader("Accept")); assertTrue(req.hasHeaderWithValue("accept", "Something")); assertTrue(req.hasHeaderWithValue("accept", "Everything")); assertFalse(req.hasHeaderWithValue("accept", "Something for nothing")); req.removeHeader("accept"); headers = req.multiHeaders(); assertEquals("Bar", headers.get("Foo").get(0)); assertFalse(req.hasHeader("Accept")); assertNull(headers.get("Accept")); } @Test void responseHeadersPreserveInsertOrder() throws IOException { // linkedhashmap preserves the response header order Connection.Response res = new HttpConnection.Response(); res.addHeader("5", "5"); res.addHeader("4", "4"); res.addHeader("3", "3"); res.addHeader("2", "2"); res.addHeader("1", "1"); String[] expected = {"5", "4", "3", "2", "1"}; int i = 0; 
for (String key : res.headers().keySet()) { assertEquals(expected[i++], key); } assertInstanceOf(LinkedHashMap.class, res.headers()); } @Test public void ignoresEmptySetCookies() { // prep http response header map Map> headers = new HashMap<>(); headers.put("Set-Cookie", Collections.emptyList()); HttpConnection.Response res = new HttpConnection.Response(); res.processResponseHeaders(headers); assertEquals(0, res.cookies().size()); } @Test public void connectWithUrl() throws MalformedURLException { Connection con = HttpConnection.connect(new URL("http://example.com")); assertEquals("http://example.com", con.request().url().toExternalForm()); } @Test public void throwsOnMalformedUrl() { assertThrows(IllegalArgumentException.class, () -> HttpConnection.connect("bzzt")); } @Test public void userAgent() { Connection con = HttpConnection.connect("http://example.com/"); assertEquals(HttpConnection.DEFAULT_UA, con.request().header("User-Agent")); con.userAgent("Mozilla"); assertEquals("Mozilla", con.request().header("User-Agent")); } @Test public void timeout() { Connection con = HttpConnection.connect("http://example.com/"); assertEquals(30 * 1000, con.request().timeout()); con.timeout(1000); assertEquals(1000, con.request().timeout()); } @Test public void referrer() { Connection con = HttpConnection.connect("http://example.com/"); con.referrer("http://foo.com"); assertEquals("http://foo.com", con.request().header("Referer")); } @Test public void method() { Connection con = HttpConnection.connect("http://example.com/"); assertEquals(Connection.Method.GET, con.request().method()); con.method(Connection.Method.POST); assertEquals(Connection.Method.POST, con.request().method()); } @Test public void throwsOnOddData() { assertThrows(IllegalArgumentException.class, () -> { Connection con = HttpConnection.connect("http://example.com/"); con.data("Name", "val", "what"); }); } @Test public void data() { Connection con = HttpConnection.connect("http://example.com/"); 
con.data("Name", "Val", "Foo", "bar"); Collection values = con.request().data(); Object[] data = values.toArray(); Connection.KeyVal one = (Connection.KeyVal) data[0]; Connection.KeyVal two = (Connection.KeyVal) data[1]; assertEquals("Name", one.key()); assertEquals("Val", one.value()); assertEquals("Foo", two.key()); assertEquals("bar", two.value()); } @Test public void cookie() { Connection con = HttpConnection.connect("http://example.com/"); con.cookie("Name", "Val"); assertEquals("Val", con.request().cookie("Name")); } @Test public void inputStream() { Connection.KeyVal kv = HttpConnection.KeyVal.create("file", "thumb.jpg", ParseTest.inputStreamFrom("Check")); assertEquals("file", kv.key()); assertEquals("thumb.jpg", kv.value()); assertTrue(kv.hasInputStream()); kv = HttpConnection.KeyVal.create("one", "two"); assertEquals("one", kv.key()); assertEquals("two", kv.value()); assertFalse(kv.hasInputStream()); } @Test public void requestBody() { Connection con = HttpConnection.connect("http://example.com/"); con.requestBody("foo"); assertEquals("foo", con.request().requestBody()); } @Test public void encodeUrl() throws MalformedURLException { URL url1 = new URL("https://test.com/foo%20bar/%5BOne%5D?q=white+space#frag"); URL url2 = new UrlBuilder(url1).build(); assertEquals("https://test.com/foo%20bar/%5BOne%5D?q=white+space#frag", url2.toExternalForm()); } @Test public void encodeUrlSupplementary() throws MalformedURLException { URL url1 = new URL("https://example.com/tools/test💩.html"); // = "/tools/test\uD83D\uDCA9.html" URL url2 = new UrlBuilder(url1).build(); assertEquals("https://example.com/tools/test%F0%9F%92%A9.html", url2.toExternalForm()); } @Test void encodedUrlDoesntDoubleEncode() throws MalformedURLException { URL url1 = new URL("https://test.com/foo%20bar/%5BOne%5D?q=white+space#frag%20ment"); URL url2 = new UrlBuilder(url1).build(); URL url3 = new UrlBuilder(url2).build(); assertEquals("https://test.com/foo%20bar/%5BOne%5D?q=white+space#frag%20ment", 
url2.toExternalForm()); assertEquals("https://test.com/foo%20bar/%5BOne%5D?q=white+space#frag%20ment", url3.toExternalForm()); } @Test void urlPathIsPreservedDoesntDoubleEncode() throws MalformedURLException { URL url1 = new URL("https://test.com/[foo] bar+/%5BOne%5D?q=white space#frag ment"); URL url2 = new UrlBuilder(url1).build(); URL url3 = new UrlBuilder(url2).build(); assertEquals("https://test.com/%5Bfoo%5D%20bar+/%5BOne%5D?q=white+space#frag%20ment", url2.toExternalForm()); assertEquals("https://test.com/%5Bfoo%5D%20bar+/%5BOne%5D?q=white+space#frag%20ment", url3.toExternalForm()); } @Test void connectToEncodedUrl() { Connection connect = Jsoup.connect("https://example.com/a%20b%20c?query+string"); URL url = connect.request().url(); assertEquals("https://example.com/a%20b%20c?query+string", url.toExternalForm()); } @Test void encodedUrlPathIsPreserved() { // https://github.com/jhy/jsoup/issues/1952 Connection connect = Jsoup.connect("https://example.com/%2B32"); URL url = connect.request().url(); assertEquals("https://example.com/%2B32", url.toExternalForm()); } @Test void urlPathPlusIsPreserved() { // https://github.com/jhy/jsoup/issues/1952 Connection connect = Jsoup.connect("https://example.com/123+456"); URL url = connect.request().url(); assertEquals("https://example.com/123+456", url.toExternalForm()); } @Test public void noUrlThrowsValidationError() throws IOException { HttpConnection con = new HttpConnection(); boolean threw = false; try { con.execute(); } catch (IllegalArgumentException e) { threw = true; assertEquals("URL not set. Make sure to call #url(...) 
before executing the request.", e.getMessage()); } assertTrue(threw); } @Test public void handlesHeaderEncodingOnRequest() { Connection.Request req = new HttpConnection.Request(); req.addHeader("xxx", "é"); } @Test public void supportsInternationalDomainNames() throws MalformedURLException { String idn = "https://www.测试.测试/foo.html?bar"; String puny = "https://www.xn--0zwm56d.xn--0zwm56d/foo.html?bar"; Connection con = Jsoup.connect(idn); assertEquals(puny, con.request().url().toExternalForm()); HttpConnection.Request req = new HttpConnection.Request(); req.url(new URL(idn)); assertEquals(puny, req.url().toExternalForm()); } @Test void supportsIdnWithPort() throws MalformedURLException { String idn = "https://www.测试.测试:9001/foo.html?bar"; String puny = "https://www.xn--0zwm56d.xn--0zwm56d:9001/foo.html?bar"; Connection con = Jsoup.connect(idn); assertEquals(puny, con.request().url().toExternalForm()); HttpConnection.Request req = new HttpConnection.Request(); req.url(new URL(idn)); assertEquals(puny, req.url().toExternalForm()); } @Test public void validationErrorsOnExecute() throws IOException { Connection con = new HttpConnection(); boolean urlThrew = false; try { con.execute(); } catch (IllegalArgumentException e) { urlThrew = e.getMessage().contains("URL"); } assertTrue(urlThrew); } @Test void testMalformedException() { boolean threw = false; try { Jsoup.connect("jsoup.org/test"); } catch (IllegalArgumentException e) { threw = true; assertEquals("The supplied URL, 'jsoup.org/test', is malformed. Make sure it is an absolute URL, and starts with 'http://' or 'https://'. 
See https://jsoup.org/cookbook/extracting-data/working-with-urls", e.getMessage()); } assertTrue(threw); } @Test void setHeaderWithUnicodeValue() { Connection connect = Jsoup.connect("https://example.com"); String value = "/foo/我的"; connect.header("Key", value); String actual = connect.request().header("Key"); assertEquals(value, actual); } @Test void setAuth() throws MalformedURLException { Connection con = Jsoup.newSession(); assertNull(con.request().auth()); RequestAuthenticator auth1 = new RequestAuthenticator() { @Override public PasswordAuthentication authenticate(Context auth) { return auth.credentials("foo", "bar"); } }; RequestAuthenticator auth2 = new RequestAuthenticator() { @Override public PasswordAuthentication authenticate(Context auth) { return auth.credentials("qux", "baz"); } }; con.auth(auth1); assertSame(con.request().auth(), auth1); con.auth(auth2); assertSame(con.request().auth(), auth2); con.request().auth(auth1); assertSame(con.request().auth(), auth1); PasswordAuthentication creds = auth1.authenticate( new RequestAuthenticator.Context(new URL("http://example.com"), Authenticator.RequestorType.SERVER, "Realm")); assertNotNull(creds); assertEquals("foo", creds.getUserName()); assertEquals("bar", new String(creds.getPassword())); } /* Tests for fixHeaderEncoding. We are handling two cases when a server sends a header in UTF8. The JVM will decode that in 8859 (per the RFC) and we'll get mojibake, and so we try to fix it. On Android, will be decoded correctly, so should not be modified. 
*/ static String mojibake(String input) { // simulate mojibake by encoding in UTF-8 and decoding in ISO-8859-1 return new String(input.getBytes(StandardCharsets.UTF_8), StandardCharsets.ISO_8859_1); } @ParameterizedTest @ValueSource(strings = {"search.php?moji=我的", "latin=café", "🍕", "ascii"}) void fixesHeaderEncodingIfRequired(String input) { // if the input was mojibaked, we fix it; otherwise is passed // https://github.com/jhy/jsoup/issues/2011 assertEquals(input, fixHeaderEncoding(input)); assertEquals(input, fixHeaderEncoding(mojibake(input))); } } ================================================ FILE: src/test/java/org/jsoup/helper/RegexTest.java ================================================ package org.jsoup.helper; import org.jsoup.select.QueryParser; import org.jsoup.select.Selector; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; import static org.junit.jupiter.api.Assertions.*; public class RegexTest { private boolean originalUseRe2j; // track original setting @BeforeEach void setUp() { originalUseRe2j = Regex.wantsRe2j(); } @AfterEach void tearDown() { Regex.wantsRe2j(originalUseRe2j); // restore original setting } @ParameterizedTest @ValueSource(booleans = {false, true}) void testRegexDelegates(boolean useRe2j) { Regex.wantsRe2j(useRe2j); assertEquals(Regex.usingRe2j(), useRe2j); String pattern = "(\\d+)"; String input = "12345"; Regex regex = Regex.compile(pattern); Regex.Matcher matcher = regex.matcher(input); assertTrue(matcher.find()); } @Test void jdkSupportsBackreferenceMatches() { Regex.wantsRe2j(false); String pattern = "(\\w+)\\s+\\1"; // backreference to group 1 String input = "hello hello"; Regex regex = Regex.compile(pattern); Regex.Matcher matcher = regex.matcher(input); assertTrue(matcher.find()); } @Test void re2jRejectsBackreferenceThrows() { 
Regex.wantsRe2j(true); String pattern = "(\\w+)\\s+\\1"; // backreference unsupported by RE2J assertThrows(ValidationException.class, () -> Regex.compile(pattern)); // and not the rej2 PatternSyntaxException } @ParameterizedTest @ValueSource(booleans = {false, true}) void queryParserThrowsSelectorExceptionOnMalformedRegex(boolean useRe2j) { Regex.wantsRe2j(useRe2j); String query = "[attr~=(unclosed]"; boolean threw = false; try { QueryParser.parse(query); } catch (Selector.SelectorParseException e) { threw = true; assertTrue(e.getMessage().contains("Pattern syntax error")); } assertTrue(threw); } } ================================================ FILE: src/test/java/org/jsoup/helper/ValidateTest.java ================================================ package org.jsoup.helper; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; @SuppressWarnings("deprecation") // keeps tests for ensureNotNull public class ValidateTest { @Test public void testNotNull() { Validate.notNull("foo"); boolean threw = false; try { Validate.notNull(null); } catch (IllegalArgumentException e) { threw = true; } Assertions.assertTrue(threw); } @Test void stacktraceFiltersOutValidateClass() { boolean threw = false; try { Validate.notNull(null); } catch (ValidationException e) { threw = true; assertEquals("Object must not be null", e.getMessage()); StackTraceElement[] stackTrace = e.getStackTrace(); for (StackTraceElement trace : stackTrace) { assertNotEquals(trace.getClassName(), Validate.class.getName()); } assertTrue(stackTrace.length >= 1); } Assertions.assertTrue(threw); } @Test void nonnullParam() { boolean threw = true; try { Validate.notNullParam(null, "foo"); } catch (ValidationException e) { assertEquals("The parameter 'foo' must not be null.", e.getMessage()); } assertTrue(threw); } @Test public void testWtf() { boolean threw = false; try { Validate.wtf("Unexpected state reached"); } catch (IllegalStateException e) 
{ threw = true; assertEquals("Unexpected state reached", e.getMessage()); } assertTrue(threw); } @Test public void testEnsureNotNull() { // Test with a non-null object Object obj = new Object(); assertSame(obj, Validate.ensureNotNull(obj)); // Test with a null object boolean threw = false; try { Validate.ensureNotNull(null); } catch (ValidationException e) { threw = true; assertEquals("Object must not be null", e.getMessage()); } assertTrue(threw); } @Test public void testEnsureNotNullWithMessage() { // Test with a non-null object Object obj = new Object(); assertSame(obj, Validate.ensureNotNull(obj, "Object must not be null")); // Test with a null object boolean threw = false; try { Validate.ensureNotNull(null, "Custom error message"); } catch (ValidationException e) { threw = true; assertEquals("Custom error message", e.getMessage()); } assertTrue(threw); } @Test public void testEnsureNotNullWithFormattedMessage() { // Test with a non-null object Object obj = new Object(); assertSame(obj, Validate.ensureNotNull(obj, "Object must not be null: %s", "additional info")); // Test with a null object boolean threw = false; try { Validate.ensureNotNull(null, "Object must not be null: %s", "additional info"); } catch (ValidationException e) { threw = true; assertEquals("Object must not be null: additional info", e.getMessage()); } assertTrue(threw); } @Test void expectNotNull() { String foo = "Foo"; String foo2 = Validate.expectNotNull(foo); assertSame(foo, foo2); // Test with a null object String bar = null; boolean threw = false; try { Validate.expectNotNull(bar); } catch (ValidationException e) { threw = true; assertEquals("Object must not be null", e.getMessage()); } assertTrue(threw); } @Test public void testNotNullParam() { // Test with a non-null object Object obj = new Object(); Validate.notNullParam(obj, "param"); // Test with a null object boolean threw = false; try { Validate.notNullParam(null, "param"); } catch (ValidationException e) { threw = true; 
assertEquals("The parameter 'param' must not be null.", e.getMessage()); } assertTrue(threw); } @Test public void testNotEmpty() { // Test with a non-empty string String str = "foo"; Validate.notEmpty(str); // Test with an empty string boolean threw = false; try { Validate.notEmpty(""); } catch (ValidationException e) { threw = true; assertEquals("String must not be empty", e.getMessage()); } assertTrue(threw); // Test with a null string threw = false; try { Validate.notEmpty(null); } catch (ValidationException e) { threw = true; assertEquals("String must not be empty", e.getMessage()); } assertTrue(threw); } @Test public void testIsTrue() { // Test with a true value Validate.isTrue(true); // Test with a false value boolean threw = false; try { Validate.isTrue(false); } catch (ValidationException e) { threw = true; assertEquals("Must be true", e.getMessage()); } assertTrue(threw); } @Test public void testIsFalse() { // Test with a false value Validate.isFalse(false); // Test with a true value boolean threw = false; try { Validate.isFalse(true); } catch (ValidationException e) { threw = true; assertEquals("Must be false", e.getMessage()); } assertTrue(threw); } @Test public void testAssertFail() { boolean result = false; boolean threw = false; try { result = Validate.assertFail("This should fail"); } catch (ValidationException e) { threw = true; assertEquals("This should fail", e.getMessage()); } assertTrue(threw); assertFalse(result); } @Test public void testNotEmptyParam() { // Test with a non-empty string Validate.notEmptyParam("foo", "param"); // Test with an empty string boolean threw = false; try { Validate.notEmptyParam("", "param"); } catch (ValidationException e) { threw = true; assertEquals("The 'param' parameter must not be empty.", e.getMessage()); } assertTrue(threw); // Test with a null string threw = false; try { Validate.notEmptyParam(null, "param"); } catch (ValidationException e) { threw = true; assertEquals("The 'param' parameter must not be 
empty.", e.getMessage()); } assertTrue(threw); } @Test public void testNoNullElementsWithMessage() { // Test with an array with no null elements Object[] array = {new Object(), new Object()}; Validate.noNullElements(array, "Custom error message"); // Test with an array containing a null element boolean threw = false; try { Validate.noNullElements(new Object[]{new Object(), null}, "Custom error message"); } catch (ValidationException e) { threw = true; assertEquals("Custom error message", e.getMessage()); } assertTrue(threw); } @Test public void testNotEmptyWithMessage() { // Test with a non-empty string Validate.notEmpty("foo", "Custom error message"); // Test with an empty string boolean threw = false; try { Validate.notEmpty("", "Custom error message"); } catch (ValidationException e) { threw = true; assertEquals("Custom error message", e.getMessage()); } assertTrue(threw); // Test with a null string threw = false; try { Validate.notEmpty(null, "Custom error message"); } catch (ValidationException e) { threw = true; assertEquals("Custom error message", e.getMessage()); } assertTrue(threw); } } ================================================ FILE: src/test/java/org/jsoup/helper/W3CDomTest.java ================================================ package org.jsoup.helper; import org.jsoup.Jsoup; import org.jsoup.TextUtil; import org.jsoup.integration.ParseTest; import org.jsoup.nodes.Element; import org.jsoup.nodes.TextNode; import org.jsoup.parser.Parser; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import 
javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.StringReader; import java.nio.charset.StandardCharsets; import java.util.Locale; import java.util.Map; import java.util.stream.Stream; import static org.jsoup.TextUtil.normalizeSpaces; import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml; import static org.junit.jupiter.api.Assertions.*; public class W3CDomTest { private static Document parseXml(String xml, boolean nameSpaceAware) { try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(nameSpaceAware); DocumentBuilder builder = factory.newDocumentBuilder(); builder.setEntityResolver((publicId, systemId) -> { if (systemId.contains("about:legacy-compat")) { // return new InputSource(new StringReader("")); } else { return null; } }); Document dom = builder.parse(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); dom.normalizeDocument(); return dom; } catch (Exception e) { throw new IllegalStateException(e); } } @Test public void simpleConversion() { String html = "W3c

Text

What"; assertEquals(expected, TextUtil.stripNewlines(out)); Document roundTrip = parseXml(out, true); assertEquals("Text", roundTrip.getElementsByTagName("p").item(0).getTextContent()); // check we can set properties Map properties = W3CDom.OutputXml(); properties.put(OutputKeys.INDENT, "yes"); String furtherOut = W3CDom.asString(wDoc, properties); assertTrue(furtherOut.length() > out.length()); // wanted to assert formatting, but actual indentation is platform specific so breaks in CI String furtherExpected = "W3c

Text

What"; assertEquals(furtherExpected, TextUtil.stripNewlines(furtherOut)); // on windows, DOM will write newlines as \r\n } @Test public void namespacePreservation() throws IOException { File in = ParseTest.getFile("/htmltests/namespaces.xhtml"); org.jsoup.nodes.Document jsoupDoc; jsoupDoc = Jsoup.parse(in, "UTF-8", "", Parser.xmlParser()); Document doc; org.jsoup.helper.W3CDom jDom = new org.jsoup.helper.W3CDom(); doc = jDom.fromJsoup(jsoupDoc); Node htmlEl = doc.getChildNodes().item(0); assertEquals("http://www.w3.org/1999/xhtml", htmlEl.getNamespaceURI()); assertEquals("html", htmlEl.getLocalName()); assertEquals("html", htmlEl.getNodeName()); // inherits default namespace Node head = htmlEl.getFirstChild().getNextSibling(); assertEquals("http://www.w3.org/1999/xhtml", head.getNamespaceURI()); assertEquals("head", head.getLocalName()); assertEquals("head", head.getNodeName()); Node epubTitle = htmlEl.getChildNodes().item(3).getChildNodes().item(3); assertEquals("Check", epubTitle.getTextContent()); assertEquals("http://www.idpf.org/2007/ops", epubTitle.getNamespaceURI()); assertEquals("title", epubTitle.getLocalName()); assertEquals("epub:title", epubTitle.getNodeName()); Node xSection = epubTitle.getNextSibling().getNextSibling(); assertEquals("urn:test", xSection.getNamespaceURI()); assertEquals("section", xSection.getLocalName()); assertEquals("x:section", xSection.getNodeName()); // https://github.com/jhy/jsoup/issues/977 // does not keep last set namespace Node svg = xSection.getNextSibling().getNextSibling(); assertEquals("http://www.w3.org/2000/svg", svg.getNamespaceURI()); assertEquals("svg", svg.getLocalName()); assertEquals("svg", svg.getNodeName()); Node path = svg.getChildNodes().item(1); assertEquals("http://www.w3.org/2000/svg", path.getNamespaceURI()); assertEquals("path", path.getLocalName()); assertEquals("path", path.getNodeName()); Node clip = path.getChildNodes().item(1); assertEquals("http://example.com/clip", clip.getNamespaceURI()); 
assertEquals("clip", clip.getLocalName()); assertEquals("clip", clip.getNodeName()); assertEquals("456", clip.getTextContent()); Node picture = svg.getNextSibling().getNextSibling(); assertEquals("http://www.w3.org/1999/xhtml", picture.getNamespaceURI()); assertEquals("picture", picture.getLocalName()); assertEquals("picture", picture.getNodeName()); Node img = picture.getFirstChild(); assertEquals("http://www.w3.org/1999/xhtml", img.getNamespaceURI()); assertEquals("img", img.getLocalName()); assertEquals("img", img.getNodeName()); } @Test public void handlesInvalidAttributeNames() { String html = ""; org.jsoup.nodes.Document jsoupDoc; jsoupDoc = Jsoup.parse(html); Element body = jsoupDoc.select("body").first(); assertTrue(body.hasAttr("\"")); // actually an attribute with key '"'. Correct per HTML5 spec, but w3c xml dom doesn't dig it assertTrue(body.hasAttr("name\"")); Document w3Doc = W3CDom.convert(jsoupDoc); String xml = W3CDom.asString(w3Doc, W3CDom.OutputXml()); assertEquals("", xml); } @Test public void htmlInputDocMaintainsHtmlAttributeNames() { String html = "

unicode attr names

"; org.jsoup.nodes.Document jsoupDoc; jsoupDoc = Jsoup.parse(html); Document w3Doc = W3CDom.convert(jsoupDoc); String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml()); String expected = "

unicode attr names

"; assertEquals(expected, TextUtil.stripNewlines(out)); } @Test public void xmlInputDocMaintainsHtmlAttributeNames() { String html = "

unicode attr names coerced

"; org.jsoup.nodes.Document jsoupDoc; jsoupDoc = Jsoup.parse(html); jsoupDoc.outputSettings().syntax(xml); Document w3Doc = W3CDom.convert(jsoupDoc); String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml()); String expected = "

unicode attr names coerced

"; assertEquals(expected, TextUtil.stripNewlines(out)); } @Test public void handlesInvalidTagAsText() { org.jsoup.nodes.Document jsoup = Jsoup.parse("<インセンティブで高収入!>Text

More

"); Document w3Doc = W3CDom.convert(jsoup); String xml = W3CDom.asString(w3Doc, W3CDom.OutputXml()); assertEquals("<インセンティブで高収入!>Text

More

", xml); } @Test void handlesHtmlElsWithLt() { // In HTML, elements can be named "foo). Test that we can convert to W3C, that we can HTML parse our HTML serial, XML parse our XML serial, and W3C XML parse the XML serial and the W3C serial // And similarly attributes may have "<" in their name // https://github.com/jhy/jsoup/issues/2259 String input = "Text"; String xmlExpect = "Text"; // rewrites < to _ in el and attr // html round trips org.jsoup.nodes.Document htmlDoc = Jsoup.parse(input); String htmlSerial = htmlDoc.body().html(); assertEquals(input, normalizeSpaces(htmlSerial)); // same as input Element htmlRound = Jsoup.parse(htmlSerial).body(); assertTrue(htmlDoc.body().hasSameValue(htmlRound)); // xml round trips htmlDoc.outputSettings().syntax(xml); String asXml = htmlDoc.body().html(); assertEquals(xmlExpect, normalizeSpaces(asXml)); // -> org.jsoup.nodes.Document xmlDoc = Jsoup.parse(asXml); String xmlSerial = xmlDoc.body().html(); assertEquals(xmlExpect, normalizeSpaces(xmlSerial)); // same as xmlExpect Element xmlRound = Jsoup.parse(xmlSerial).body(); assertTrue(xmlDoc.body().hasSameValue(xmlRound)); // Can W3C parse that XML Document w3cXml = parseXml(asXml, true); NodeList w3cXmlNodes = w3cXml.getElementsByTagName("foo_bar"); assertEquals(1, w3cXmlNodes.getLength()); assertEquals("123", w3cXmlNodes.item(0).getAttributes().getNamedItem("attr_name").getTextContent()); // Can convert to W3C Document w3cDoc = W3CDom.convert(htmlDoc); NodeList w3cNodes = w3cDoc.getElementsByTagName("foo_bar"); assertEquals(1, w3cNodes.getLength()); assertEquals("123", w3cNodes.item(0).getAttributes().getNamedItem("attr_name").getTextContent()); } @Test public void canConvertToCustomDocument() throws ParserConfigurationException { org.jsoup.nodes.Document document = Jsoup.parse("
"); DocumentBuilderFactory localDocumentBuilderFactory = DocumentBuilderFactory.newInstance(); Document customDocumentResult = localDocumentBuilderFactory.newDocumentBuilder().newDocument(); W3CDom w3cDom = new W3CDom(); w3cDom.convert(document, customDocumentResult); String html = W3CDom.asString(customDocumentResult, W3CDom.OutputHtml()); assertEquals("
", html); } @Test public void treatsUndeclaredNamespaceAsLocalName() { String html = "One"; org.jsoup.nodes.Document doc = Jsoup.parse(html); Document w3Doc = new W3CDom().fromJsoup(doc); Node htmlEl = w3Doc.getFirstChild(); assertEquals("http://www.w3.org/1999/xhtml", htmlEl.getNamespaceURI()); assertEquals("html", htmlEl.getLocalName()); assertEquals("html", htmlEl.getNodeName()); Node fb = htmlEl.getFirstChild().getNextSibling().getFirstChild(); assertEquals("http://www.w3.org/1999/xhtml", fb.getNamespaceURI()); assertEquals("like", fb.getLocalName()); assertEquals("fb:like", fb.getNodeName()); } @Test public void xmlnsXpathTest() throws XPathExpressionException { W3CDom w3c = new W3CDom(); String html = "
hello
"; Document dom = w3c.fromJsoup(Jsoup.parse(html)); NodeList nodeList = xpath(dom, "//*[local-name()=\"body\"]");// namespace aware; HTML namespace is default assertEquals("div", nodeList.item(0).getLocalName()); // default output is namespace aware, so query needs to be as well html = "
hello
"; dom = w3c.fromJsoup(Jsoup.parse(html)); nodeList = xpath(dom, "//body"); assertNull(nodeList); // no matches dom = w3c.fromJsoup(Jsoup.parse(html)); nodeList = xpath(dom, "//*[local-name()=\"body\"]"); assertNotNull(nodeList); assertEquals(1, nodeList.getLength()); assertEquals("div", nodeList.item(0).getLocalName()); assertEquals("http://www.w3.org/1999/xhtml", nodeList.item(0).getNamespaceURI()); assertNull(nodeList.item(0).getPrefix()); // get rid of the name space awareness String xml = w3c.asString(dom); dom = parseXml(xml, false); Node item = (Node) xpath(dom, "//body"); assertEquals("body", item.getNodeName()); assertNull(item.getNamespaceURI()); assertNull(item.getPrefix()); // put back, will get zero dom = parseXml(xml, true); nodeList = xpath(dom, "//body"); assertNull(nodeList); } @Test public void xhtmlNoNamespace() throws XPathExpressionException { W3CDom w3c = new W3CDom(); String html = "
hello
"; w3c.namespaceAware(false); Document dom = w3c.fromJsoup(Jsoup.parse(html)); NodeList nodeList = xpath(dom, "//body");// no namespace assertEquals(1, nodeList.getLength()); assertEquals("div", nodeList.item(0).getLocalName()); } @Test void canDisableNamespaces() throws XPathExpressionException { W3CDom w3c = new W3CDom(); assertTrue(w3c.namespaceAware()); w3c.namespaceAware(false); assertFalse(w3c.namespaceAware()); String html = "
hello
"; Document dom = w3c.fromJsoup(Jsoup.parse(html)); NodeList nodeList = xpath(dom, "//body");// no ns, so needs no prefix assertEquals("div", nodeList.item(0).getLocalName()); } private NodeList xpath(Document w3cDoc, String query) throws XPathExpressionException { XPathExpression xpath = XPathFactory.newInstance().newXPath().compile(query); return ((NodeList) xpath.evaluate(w3cDoc, XPathConstants.NODE)); } @Test public void testRoundTripDoctype() { // because we have Saxon on the test classpath, the transformer will change to that, and so case may change (e.g. Java base is META, Saxon is meta for HTML) String base = "

One

"; assertEqualsIgnoreCase("

One

", output(base, true)); assertEqualsIgnoreCase("

One

", output(base, false)); String publicDoc = ""; assertEqualsIgnoreCase("", output(publicDoc, true)); // different impls will have different XML formatting. OpenJDK 13 default gives this: but others have , so just check start assertTrue(output(publicDoc, false).startsWith(""; assertEqualsIgnoreCase("", output(systemDoc, true)); assertEqualsIgnoreCase("", output(systemDoc, false)); String legacyDoc = ""; assertEqualsIgnoreCase("", output(legacyDoc, true)); assertEqualsIgnoreCase("", output(legacyDoc, false)); String noDoctype = "

One

"; assertEqualsIgnoreCase("

One

", output(noDoctype, true)); assertEqualsIgnoreCase("

One

", output(noDoctype, false)); } private String output(String in, boolean modeHtml) { org.jsoup.nodes.Document jdoc = Jsoup.parse(in); Document w3c = W3CDom.convert(jdoc); Map properties = modeHtml ? W3CDom.OutputHtml() : W3CDom.OutputXml(); return normalizeSpaces(W3CDom.asString(w3c, properties)); } private void assertEqualsIgnoreCase(String want, String have) { assertEquals(want.toLowerCase(Locale.ROOT), have.toLowerCase(Locale.ROOT)); } @Test public void canOutputHtmlWithoutNamespace() { String html = "

One

"; org.jsoup.nodes.Document jdoc = Jsoup.parse(html); W3CDom w3c = new W3CDom(); w3c.namespaceAware(false); String asHtml = W3CDom.asString(w3c.fromJsoup(jdoc), W3CDom.OutputHtml()); String asXtml = W3CDom.asString(w3c.fromJsoup(jdoc), W3CDom.OutputXml()); assertEqualsIgnoreCase( "

one

", asHtml); assertEqualsIgnoreCase( "

One

", asXtml); } @Test public void convertsElementsAndMaintainsSource() { org.jsoup.nodes.Document jdoc = Jsoup.parse("

One

Two"); W3CDom w3CDom = new W3CDom(); Element jDiv = jdoc.selectFirst("div"); assertNotNull(jDiv); Document doc = w3CDom.fromJsoup(jDiv); Node div = w3CDom.contextNode(doc); assertEquals("div", div.getLocalName()); assertEquals(jDiv, div.getUserData(W3CDom.SourceProperty)); Node textNode = div.getFirstChild().getFirstChild(); assertEquals("One", textNode.getTextContent()); assertEquals(Node.TEXT_NODE, textNode.getNodeType()); org.jsoup.nodes.TextNode jText = (TextNode) jDiv.childNode(0).childNode(0); assertEquals(jText, textNode.getUserData(W3CDom.SourceProperty)); } @Test public void canXmlParseCdataNodes() throws XPathExpressionException { String html = "

5 && 6

"; org.jsoup.nodes.Document jdoc = Jsoup.parse(html); jdoc.outputSettings().syntax(xml); String xml = jdoc.body().html(); assertTrue(xml.contains("")); // as asserted in ElementTest Document doc = parseXml(xml, false); NodeList list = xpath(doc, "//script"); assertEquals(2, list.getLength()); Node scriptComment = list.item(0); // will be the cdata node assertEquals("//", scriptComment.getTextContent()); Node script = list.item(1); assertEquals("\n" + "1 && 2\n" + "//", script.getTextContent()); } @Test public void handlesEmptyDoctype() { String html = "Foo"; org.jsoup.nodes.Document jdoc = Jsoup.parse(html); Document doc = (new W3CDom()).fromJsoup(jdoc); assertNull(doc.getDoctype()); assertEquals("Foo", doc.getFirstChild().getTextContent()); } @Test void testHtmlParseAttributesAreCaseInsensitive() throws IOException { // https://github.com/jhy/jsoup/issues/981 String html = "\n" + "\n" + "\"Alt\n" + "\"Alt\n" + "\n" + ""; org.jsoup.nodes.Document jsoupDoc; jsoupDoc = Jsoup.parse(html); org.jsoup.helper.W3CDom jDom = new org.jsoup.helper.W3CDom(); Document doc = jDom.fromJsoup(jsoupDoc); org.w3c.dom.Element body = (org.w3c.dom.Element) doc.getDocumentElement().getElementsByTagName("body").item(0); NodeList imgs = body.getElementsByTagName("img"); assertEquals(2, imgs.getLength()); org.w3c.dom.Element first = (org.w3c.dom.Element) imgs.item(0); assertEquals(first.getAttributes().getLength(), 2); String img1 = first.getAttribute("src"); assertEquals("firstImage.jpg", img1); String alt1 = first.getAttribute("alt"); assertEquals("Alt one", alt1); org.w3c.dom.Element second = (org.w3c.dom.Element) imgs.item(1); assertEquals(second.getAttributes().getLength(), 2); String img2 = second.getAttribute("src"); assertEquals("secondImage.jpg", img2); String alt2 = second.getAttribute("alt"); assertEquals("Alt two", alt2); } @ParameterizedTest @MethodSource("parserProvider") void doesNotExpandEntities(Parser parser) { // Tests that the billion laughs attack doesn't expand 
entities; also for XXE // Not impacted because jsoup doesn't parse the entities within the doctype, and so won't get to the w3c. // Added to confirm, and catch if that ever changes String billionLaughs = "\n" + "\n" + " \n" + "]>\n" + "

&lol1;

"; org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(billionLaughs, parser); W3CDom w3cDom = new W3CDom(); org.w3c.dom.Document w3cDoc = w3cDom.fromJsoup(jsoupDoc); assertNotNull(w3cDoc); // select the p and make sure it's unexpanded NodeList p = w3cDoc.getElementsByTagName("p"); assertEquals(1, p.getLength()); assertEquals("&lol1;", p.item(0).getTextContent()); // Check the string String string = W3CDom.asString(w3cDoc, W3CDom.OutputXml()); assertFalse(string.contains("lololol")); assertTrue(string.contains("&lol1;")); } @Test void undeclaredAttrNamespaceAsString() { // https://github.com/jhy/jsoup/issues/2087 W3CDom w3CDom = new W3CDom(); String html = "
"; org.jsoup.nodes.Document jdoc = Jsoup.parse(html); org.w3c.dom.Document w3CDoc = w3CDom.fromJsoup(jdoc); String xml = w3CDom.asString(w3CDoc); assertEquals("
", xml); } @Test void declaredNamespaceIsUsed() { W3CDom w3CDom = new W3CDom(); String html = "
"; org.jsoup.nodes.Document jdoc = Jsoup.parse(html); org.w3c.dom.Document w3CDoc = w3CDom.fromJsoup(jdoc); String xml = w3CDom.asString(w3CDoc); assertEquals("
", xml); } @Test void nestedElementsWithUndeclaredNamespace() { W3CDom w3CDom = new W3CDom(); String html = "
"; org.jsoup.nodes.Document jdoc = Jsoup.parse(html); org.w3c.dom.Document w3CDoc = w3CDom.fromJsoup(jdoc); String xml = w3CDom.asString(w3CDoc); assertEquals("
", xml); } private static Stream parserProvider() { return Stream.of( Arguments.of(Parser.htmlParser()), Arguments.of(Parser.xmlParser()) ); } } ================================================ FILE: src/test/java/org/jsoup/integration/Benchmark.java ================================================ package org.jsoup.integration; import java.util.Date; /** Does an A/B test on two methods, and prints out how long each took. @author Jonathan Hedley, jonathan@hedley.net */ public class Benchmark { public static void run(Runnable a, Runnable b, int count) { long aMillis; long bMillis; print("Running test A (x%d)", count); aMillis = time(a, count); print("Running test B"); bMillis = time(b, count); print("\nResults:"); print("A: %.2fs", aMillis / 1000f); print("B: %.2fs", bMillis / 1000f); print("\nB ran in %.2f %% time of A\n", (bMillis *1f / aMillis * 1f) * 100f); } private static long time(Runnable test, int count) { Date start = new Date(); for (int i = 0; i < count; i++) { test.run(); } Date end = new Date(); return end.getTime() - start.getTime(); } private static void print(String msgFormat, Object... 
msgParams) { System.out.println(String.format(msgFormat, msgParams)); } } ================================================ FILE: src/test/java/org/jsoup/integration/ConnectIT.java ================================================ package org.jsoup.integration; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.helper.DataUtil; import org.jsoup.integration.servlets.EchoServlet; import org.jsoup.integration.servlets.FileServlet; import org.jsoup.integration.servlets.SlowRider; import org.jsoup.internal.SharedConstants; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.parser.StreamParser; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import java.io.BufferedInputStream; import java.io.IOException; import java.io.UncheckedIOException; import java.net.SocketTimeoutException; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.concurrent.atomic.AtomicBoolean; import static org.junit.jupiter.api.Assertions.*; /** * Failsafe integration tests for Connect methods. These take a bit longer to run, so included as Integ, not Unit, tests. */ public class ConnectIT { @BeforeAll public static void setUp() { TestServer.start(); System.setProperty(SharedConstants.UseHttpClient, "false"); // use the default UrlConnection. See HttpClientConnectIT for other version } // Slow Rider tests. @Test public void canInterruptBodyStringRead() throws InterruptedException { final String[] body = new String[1]; Thread runner = new Thread(() -> { try { Connection.Response res = Jsoup.connect(SlowRider.Url) .timeout(15 * 1000) .execute(); body[0] = res.body(); } catch (IOException e) { throw new RuntimeException(e); } }); runner.start(); Thread.sleep(1000 * 3); runner.interrupt(); assertTrue(runner.isInterrupted()); runner.join(); assertTrue(body[0].length() > 0); assertTrue(body[0].contains("

Are you still there?")); } @Test public void canInterruptDocumentRead() throws InterruptedException { long start = System.currentTimeMillis(); final String[] body = new String[1]; Thread runner = new Thread(() -> { try { Connection.Response res = Jsoup.connect(SlowRider.Url) .timeout(15 * 1000) .execute(); body[0] = res.parse().text(); } catch (IOException e) { throw new RuntimeException(e); } }); runner.start(); Thread.sleep(3 * 1000); runner.interrupt(); assertTrue(runner.isInterrupted()); runner.join(); long end = System.currentTimeMillis(); // check we are between 3 and connect timeout seconds (should be just over 3; but allow some slack for slow CI runners) assertTrue(end - start > 3 * 1000); assertTrue(end - start < 10 * 1000); } @Test public void canInterruptThenJoinASpawnedThread() throws InterruptedException { // https://github.com/jhy/jsoup/issues/1991 AtomicBoolean ioException = new AtomicBoolean(); Thread runner = new Thread(() -> { try { while (!Thread.currentThread().isInterrupted()) { Document doc = Jsoup.connect(SlowRider.Url) .timeout(30000) .get(); } } catch (IOException e) { ioException.set(true); // don't expect to catch, because the outer sleep will complete before this timeout } }); runner.start(); Thread.sleep(2 * 1000); runner.interrupt(); runner.join(); assertFalse(ioException.get()); } @Test public void totalTimeout() throws IOException { int timeout = 3 * 1000; long start = System.currentTimeMillis(); boolean threw = false; try { Jsoup.connect(SlowRider.Url).timeout(timeout).get(); } catch (SocketTimeoutException e) { long end = System.currentTimeMillis(); long took = end - start; assertTrue(took > timeout, ("Time taken was " + took)); assertTrue(took < timeout * 1.8, ("Time taken was " + took)); threw = true; } assertTrue(threw); } @Test public void slowReadOk() throws IOException { // make sure that a slow read that is under the request timeout is still OK Document doc = Jsoup.connect(SlowRider.Url) .data(SlowRider.MaxTimeParam, "2000") 
// the request completes in 2 seconds .get(); Element h1 = doc.selectFirst("h1"); assertEquals("outatime", h1.text()); } @Test void readFullyThrowsOnTimeout() throws IOException { // tests that response.readFully excepts on timeout boolean caught = false; Connection.Response res = Jsoup.connect(SlowRider.Url).timeout(3000).execute(); try { res.readFully(); } catch (IOException e) { caught = true; } assertTrue(caught); } @Test void readBodyThrowsOnTimeout() throws IOException { // tests that response.readBody excepts on timeout boolean caught = false; Connection.Response res = Jsoup.connect(SlowRider.Url).timeout(3000).execute(); try { res.readBody(); } catch (IOException e) { caught = true; } assertTrue(caught); } @Test void bodyThrowsUncheckedOnTimeout() throws IOException { // tests that response.body unchecked excepts on timeout boolean caught = false; Connection.Response res = Jsoup.connect(SlowRider.Url).timeout(3000).execute(); try { res.body(); } catch (UncheckedIOException e) { caught = true; } assertTrue(caught); } @Test public void infiniteReadSupported() throws IOException { Document doc = Jsoup.connect(SlowRider.Url) .timeout(0) .data(SlowRider.MaxTimeParam, "2000") .get(); Element h1 = doc.selectFirst("h1"); assertEquals("outatime", h1.text()); } @Test void streamParserUncheckedExceptionOnTimeoutInStream() throws IOException { boolean caught = false; try (StreamParser streamParser = Jsoup.connect(SlowRider.Url) .data(SlowRider.MaxTimeParam, "10000") .data(SlowRider.IntroSizeParam, "8000") // 8K to pass first buffer, or the timeout would occur in execute or streamparser() .timeout(4000) // has a 1000 sleep at the start .execute() .streamParser()) { // we should expect to timeout while in stream try { long count = streamParser.stream().count(); } catch (Exception e) { caught = true; UncheckedIOException ioe = (UncheckedIOException) e; IOException cause = ioe.getCause(); //assertInstanceOf(SocketTimeoutException.class, cause); // different JDKs seem to 
wrap this differently assertInstanceOf(IOException.class, cause); } } assertTrue(caught); } @Test void streamParserCheckedExceptionOnTimeoutInSelect() throws IOException { boolean caught = false; try (StreamParser streamParser = Jsoup.connect(SlowRider.Url) .data(SlowRider.MaxTimeParam, "10000") .data(SlowRider.IntroSizeParam, "8000") // 8K to pass first buffer, or the timeout would occur in execute or streamparser() .timeout(4000) // has a 1000 sleep at the start .execute() .streamParser()) { // we should expect to timeout while in stream try { long count = 0; while (streamParser.selectNext("p") != null) { count++; } } catch (IOException e) { caught = true; } } assertTrue(caught); } private static final int LargeHtmlSize = 280735; @Test public void remainingAfterFirstRead() throws IOException { int bufferSize = 5 * 1024; int capSize = 100 * 1024; String url = FileServlet.urlTo("/htmltests/large.html"); // 280 K try (BufferedInputStream stream = Jsoup.connect(url).maxBodySize(capSize) .execute().bodyStream()) { // simulates parse which does a limited read first stream.mark(bufferSize); ByteBuffer firstBytes = DataUtil.readToByteBuffer(stream, bufferSize); byte[] array = firstBytes.array(); String firstText = new String(array, StandardCharsets.UTF_8); assertTrue(firstText.startsWith("Large")); assertEquals(bufferSize, array.length); boolean fullyRead = stream.read() == -1; assertFalse(fullyRead); // reset and read again stream.reset(); ByteBuffer fullRead = DataUtil.readToByteBuffer(stream, 0); byte[] fullArray = fullRead.array(); // bodyStream is not capped to body size - only for jsoup consumed stream assertTrue(fullArray.length > capSize); assertEquals(LargeHtmlSize, fullRead.limit()); String fullText = new String(fullRead.array(), 0, fullRead.limit(), StandardCharsets.UTF_8); assertTrue(fullText.startsWith(firstText)); assertEquals(LargeHtmlSize, fullText.length()); } } @Test public void noLimitAfterFirstRead() throws IOException { int firstMaxRead = 5 * 1024; 
/**
 Once readFully() has been called on a response with a max body size set, the body stream only yields that many
 bytes, even though the served file is much larger.
 */
@Test
public void bodyStreamConstrainedViaReadFully() throws IOException {
    final int maxBytes = 5 * 1024;
    final String largeFileUrl = FileServlet.urlTo("/htmltests/large.html"); // 280 K

    try (BufferedInputStream body = Jsoup.connect(largeFileUrl).maxBodySize(maxBytes).execute().readFully().bodyStream()) {
        ByteBuffer bytesRead = DataUtil.readToByteBuffer(body, 0);
        int available = bytesRead.limit();
        assertEquals(maxBytes, available);
    }
}
org.jsoup.helper.DataUtil; import org.jsoup.helper.W3CDom; import org.jsoup.integration.servlets.*; import org.jsoup.internal.SharedConstants; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.FormElement; import org.jsoup.nodes.Node; import org.jsoup.nodes.XmlDeclaration; import org.jsoup.parser.HtmlTreeBuilder; import org.jsoup.parser.Parser; import org.jsoup.parser.StreamParser; import org.jsoup.parser.XmlTreeBuilder; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.MethodSource; import org.junit.jupiter.params.provider.ValueSource; import javax.servlet.http.HttpServletResponse; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.net.Authenticator; import java.net.MalformedURLException; import java.net.URL; import java.net.URLDecoder; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; import static org.jsoup.helper.AuthenticationHandlerTest.MaxAttempts; import static org.jsoup.helper.HttpConnection.CONTENT_TYPE; import static org.jsoup.helper.HttpConnection.MULTIPART_FORM_DATA; import static org.junit.jupiter.api.Assertions.*; /** * Tests Jsoup.connect against a local server. */ public class ConnectTest { private static final int LargeDocFileLen = 280735; private static final int LargeDocTextLen = 269535; private static String echoUrl; @BeforeAll public static void setUp() { TestServer.start(); echoUrl = EchoServlet.Url; System.setProperty(SharedConstants.UseHttpClient, "false"); // use the default UrlConnection. 
See HttpClientConnectTest for other version } @BeforeEach public void emptyCookieJar() { // empty the cookie jar, so cookie tests are independent. Jsoup.connect("http://example.com").cookieStore().removeAll(); } @Test public void canConnectToLocalServer() throws IOException { String url = HelloServlet.Url; Document doc = Jsoup.connect(url).get(); Element p = doc.selectFirst("p"); assertEquals("Hello, World!", p.text()); } @Test void canConnectToLocalTlsServer() throws IOException { String url = HelloServlet.TlsUrl; Document doc = Jsoup.connect(url).get(); Element p = doc.selectFirst("p"); assertEquals("Hello, World!", p.text()); } @Test public void fetchURl() throws IOException { Document doc = Jsoup.parse(new URL(echoUrl), 10 * 1000); assertTrue(doc.title().contains("Environment Variables")); } @Test public void fetchURIWithWhitespace() throws IOException { Connection con = Jsoup.connect(echoUrl + "#with whitespaces"); Document doc = con.get(); assertTrue(doc.title().contains("Environment Variables")); } @Test public void exceptOnUnsupportedProtocol() { String url = "file://etc/passwd"; boolean threw = false; try { Document doc = Jsoup.connect(url).get(); } catch (MalformedURLException e) { threw = true; assertEquals("java.net.MalformedURLException: Only http & https protocols supported", e.toString()); } catch (IOException e) { } assertTrue(threw); } static String ihVal(String key, Document doc) { final Element first = doc.select("th:contains(" + key + ") + td").first(); return first != null ? 
first.text() : null; } @Test void statusMessage() throws IOException { Connection con = Jsoup.connect(EchoServlet.Url); Document doc = con.get(); assertEquals("OK", con.response().statusMessage()); } @Test public void throwsExceptionOn404() { String url = EchoServlet.Url; Connection con = Jsoup.connect(url).header(EchoServlet.CodeParam, "404"); boolean threw = false; try { Document doc = con.get(); } catch (HttpStatusException e) { threw = true; assertEquals("org.jsoup.HttpStatusException: HTTP error fetching URL. Status=404, URL=[" + e.getUrl() + "]", e.toString()); assertTrue(e.getUrl().startsWith(url)); assertEquals(404, e.getStatusCode()); } catch (IOException e) { } assertTrue(threw); } @Test public void ignoresExceptionIfSoConfigured() throws IOException { String url = EchoServlet.Url; Connection con = Jsoup.connect(url) .header(EchoServlet.CodeParam, "404") .ignoreHttpErrors(true); Connection.Response res = con.execute(); Document doc = res.parse(); assertEquals(404, res.statusCode()); assertEquals("Not Found", res.statusMessage()); assertEquals("Webserver Environment Variables", doc.title()); } @Test public void doesPost() throws IOException { Document doc = Jsoup.connect(echoUrl) .data("uname", "Jsoup", "uname", "Jonathan", "百", "度一下") .cookie("auth", "token") .post(); assertEquals("POST", ihVal("Method", doc)); assertEquals("gzip", ihVal("Accept-Encoding", doc)); assertEquals("auth=token", ihVal("Cookie", doc)); assertEquals("度一下", ihVal("百", doc)); assertEquals("Jsoup, Jonathan", ihVal("uname", doc)); assertEquals("application/x-www-form-urlencoded; charset=UTF-8", ihVal("Content-Type", doc)); } @Test public void doesPostMultipartWithoutInputstream() throws IOException { Document doc = Jsoup.connect(echoUrl) .header(CONTENT_TYPE, MULTIPART_FORM_DATA) .data("uname", "Jsoup", "uname", "Jonathan", "百", "度一下") .post(); assertTrue(ihVal("Content-Type", doc).contains(MULTIPART_FORM_DATA)); assertTrue(ihVal("Content-Type", doc).contains("boundary")); // should 
be automatically set
        assertEquals("Jsoup, Jonathan", ihVal("uname", doc));
        assertEquals("度一下", ihVal("百", doc));
    }

    /** Sends a custom header and two Sec-Fetch headers, and checks the echoed values. */
    @Test
    public void canSendSecFetchHeaders() throws IOException {
        // https://github.com/jhy/jsoup/issues/1461
        Document doc = Jsoup.connect(echoUrl)
            .header("Random-Header-name", "hello")
            .header("Sec-Fetch-Site", "cross-site")
            .header("Sec-Fetch-Mode", "cors")
            .get();

        assertEquals("hello", ihVal("Random-Header-name", doc));
        assertEquals("cross-site", ihVal("Sec-Fetch-Site", doc));
        assertEquals("cors", ihVal("Sec-Fetch-Mode", doc));
    }

    /** Same headers as above, but requested via the redirect servlet pointing at the echo URL. */
    @Test
    public void secFetchHeadersSurviveRedirect() throws IOException {
        Document doc = Jsoup
            .connect(RedirectServlet.Url)
            .data(RedirectServlet.LocationParam, echoUrl)
            .header("Random-Header-name", "hello")
            .header("Sec-Fetch-Site", "cross-site")
            .header("Sec-Fetch-Mode", "cors")
            .get();

        assertEquals("hello", ihVal("Random-Header-name", doc));
        assertEquals("cross-site", ihVal("Sec-Fetch-Site", doc));
        assertEquals("cors", ihVal("Sec-Fetch-Mode", doc));
    }

    /** With both a request body and data set, the data is expected in the query string and the body as post data. */
    @Test
    public void sendsRequestBodyJsonWithData() throws IOException {
        final String body = "{key:value}";
        Document doc = Jsoup.connect(echoUrl)
            .requestBody(body)
            .header("Content-Type", "application/json")
            .data("foo", "true")
            .post();
        assertEquals("POST", ihVal("Method", doc));
        assertEquals("application/json", ihVal("Content-Type", doc));
        assertEquals("foo=true", ihVal("Query String", doc));
        assertEquals(body, ihVal("Post Data", doc));
    }

    /** Posts a JSON request body with no additional data. */
    @Test
    public void sendsRequestBodyJsonWithoutData() throws IOException {
        final String body = "{key:value}";
        Document doc = Jsoup.connect(echoUrl)
            .requestBody(body)
            .header("Content-Type", "application/json")
            .post();
        assertEquals("POST", ihVal("Method", doc));
        assertEquals("application/json", ihVal("Content-Type", doc));
        assertEquals(body, ihVal("Post Data", doc));
    }

    /** Posts a request body with a text/plain content type. */
    @Test
    public void sendsRequestBody() throws IOException {
        final String body = "{key:value}";
        Document doc = Jsoup.connect(echoUrl)
            .requestBody(body)
            .header("Content-Type",
"text/plain")
            .post();
        assertEquals("POST", ihVal("Method", doc));
        assertEquals("text/plain", ihVal("Content-Type", doc));
        assertEquals(body, ihVal("Post Data", doc));
    }

    /** Posts a request body plus data; the data is expected URL-encoded in the query string. */
    @Test
    public void sendsRequestBodyWithUrlParams() throws IOException {
        final String body = "{key:value}";
        Document doc = Jsoup.connect(echoUrl)
            .requestBody(body)
            .data("uname", "Jsoup", "uname", "Jonathan", "百", "度一下")
            .header("Content-Type", "text/plain") // todo - if user sets content-type, we should append postcharset
            .post();
        assertEquals("POST", ihVal("Method", doc));
        assertEquals("uname=Jsoup&uname=Jonathan&%E7%99%BE=%E5%BA%A6%E4%B8%80%E4%B8%8B", ihVal("Query String", doc));
        assertEquals(body, ihVal("Post Data", doc));
    }

    /** Supplies the request body as an InputStream rather than a String. */
    @Test
    void sendsRequestBodyStream() throws IOException {
        final String body = "{key:value}";
        InputStream stream = new ByteArrayInputStream(body.getBytes(StandardCharsets.UTF_8));
        Document doc = Jsoup.connect(echoUrl)
            .requestBodyStream(stream)
            .header("Content-Type", "application/json")
            .data("foo", "true")
            .post();
        assertEquals("POST", ihVal("Method", doc));
        assertEquals("application/json", ihVal("Content-Type", doc));
        assertEquals("foo=true", ihVal("Query String", doc));
        assertEquals(body, ihVal("Post Data", doc));
    }

    /** GET with a query string already in the URL plus extra data; both are expected in the echoed query string. */
    @ParameterizedTest
    @MethodSource("echoUrls") // http and https
    public void doesGet(String url) throws IOException {
        Connection con = Jsoup.connect(url + "?what=the")
            .userAgent("Mozilla")
            .referrer("http://example.com")
            .data("what", "about & me?");

        Document doc = con.get();
        assertEquals("what=the&what=about+%26+me%3F", ihVal("Query String", doc));
        assertEquals("the, about & me?", ihVal("what", doc));
        assertEquals("Mozilla", ihVal("User-Agent", doc));
        assertEquals("http://example.com", ihVal("Referer", doc));
    }

    /** Reads the echo response incrementally through a StreamParser instead of a full parse. */
    @ParameterizedTest
    @MethodSource("echoUrls") // http and https
    public void streamParserGet(String url) throws IOException {
        Connection con = Jsoup.connect(url)
            .userAgent("Mozilla")
            .referrer("http://example.com")
            .data("what", "about & me?");

        //final
Element first = doc.select("th:contains(" + key + ") + td").first(); try (StreamParser streamer = con.execute().streamParser()) { Element title = streamer.expectFirst("title"); assertEquals("Webserver Environment Variables", title.text()); Element method = streamer.expectNext(echoSelect("Method")); assertEquals("GET", method.text()); Document doc = streamer.document(); assertSame(doc, title.ownerDocument()); assertEquals(url + "?what=about+%26+me%3F", doc.location()); // with the query string } } static String echoSelect(String key) { return String.format("th:contains(%s) + td", key); } @Test public void doesPut() throws IOException { Connection.Response res = Jsoup.connect(echoUrl) .data("uname", "Jsoup", "uname", "Jonathan", "百", "度一下") .cookie("auth", "token") .method(Connection.Method.PUT) .execute(); Document doc = res.parse(); assertEquals("PUT", ihVal("Method", doc)); assertEquals("gzip", ihVal("Accept-Encoding", doc)); assertEquals("auth=token", ihVal("Cookie", doc)); } @Test public void doesDeleteWithBody() throws IOException { // https://github.com/jhy/jsoup/issues/1972 String body = "some body"; Connection.Response res = Jsoup.connect(echoUrl) .requestBody(body) .method(Method.DELETE) .execute(); Document doc = res.parse(); assertEquals("DELETE", ihVal("Method", doc)); assertEquals(body, ihVal("Post Data", doc)); } @Test public void doesDeleteWithoutBody() throws IOException { Connection.Response res = Jsoup.connect(echoUrl) .method(Method.DELETE) .execute(); Document doc = res.parse(); assertEquals("DELETE", ihVal("Method", doc)); assertEquals(null, ihVal("Post Data", doc)); } /** * Tests upload of content to a remote service. 
*/ @ParameterizedTest @MethodSource("echoUrls") // http and https public void postFiles(String url) throws IOException { File thumb = ParseTest.getFile("/htmltests/thumb.jpg"); File html = ParseTest.getFile("/htmltests/large.html"); Document res = Jsoup .connect(url) .data("firstname", "Jay") .data("firstPart", thumb.getName(), Files.newInputStream(thumb.toPath()), "image/jpeg") .data("secondPart", html.getName(), Files.newInputStream(html.toPath())) // defaults to "application-octetstream"; .data("surname", "Soup") .post(); assertEquals("4", ihVal("Parts", res)); assertEquals("application/octet-stream", ihVal("Part secondPart ContentType", res)); assertEquals("secondPart", ihVal("Part secondPart Name", res)); assertEquals("large.html", ihVal("Part secondPart Filename", res)); assertEquals("280735", ihVal("Part secondPart Size", res)); assertEquals("image/jpeg", ihVal("Part firstPart ContentType", res)); assertEquals("firstPart", ihVal("Part firstPart Name", res)); assertEquals("thumb.jpg", ihVal("Part firstPart Filename", res)); assertEquals("1052", ihVal("Part firstPart Size", res)); assertEquals("Jay", ihVal("firstname", res)); assertEquals("Soup", ihVal("surname", res)); /* <tr><th>Part secondPart ContentType</th><td>application/octet-stream</td></tr> <tr><th>Part secondPart Name</th><td>secondPart</td></tr> <tr><th>Part secondPart Filename</th><td>google-ipod.html</td></tr> <tr><th>Part secondPart Size</th><td>43972</td></tr> <tr><th>Part firstPart ContentType</th><td>image/jpeg</td></tr> <tr><th>Part firstPart Name</th><td>firstPart</td></tr> <tr><th>Part firstPart Filename</th><td>thumb.jpg</td></tr> <tr><th>Part firstPart Size</th><td>1052</td></tr> */ } @Test public void multipleParsesOkAfterReadFully() throws IOException { Connection.Response res = Jsoup.connect(echoUrl).execute().readFully(); Document doc = res.parse(); assertTrue(doc.title().contains("Environment")); Document doc2 = res.parse(); assertTrue(doc2.title().contains("Environment")); } @Test 
public void multipleParsesOkAfterBufferUp() throws IOException {
        // after bufferUp(), the response can be parsed more than once
        Connection.Response res = Jsoup.connect(echoUrl).execute().bufferUp();

        Document doc = res.parse();
        assertTrue(doc.title().contains("Environment"));

        Document doc2 = res.parse();
        assertTrue(doc2.title().contains("Environment"));
    }

    /** Without buffering first, asking for the body after a parse is expected to throw. */
    @Test
    public void bodyAfterParseThrowsValidationError() {
        assertThrows(IllegalArgumentException.class, () -> {
            Connection.Response res = Jsoup.connect(echoUrl).execute();
            Document doc = res.parse();
            String body = res.body();
        });
    }

    /** Reading body() and bodyAsBytes() before parse() leaves the response still parseable. */
    @Test
    public void bodyAndBytesAvailableBeforeParse() throws IOException {
        Connection.Response res = Jsoup.connect(echoUrl).execute();
        String body = res.body();
        assertTrue(body.contains("Environment"));
        byte[] bytes = res.bodyAsBytes();
        assertTrue(bytes.length > 100);

        Document doc = res.parse();
        assertTrue(doc.title().contains("Environment"));
    }

    /** Without buffering first, a second parse of the same response is expected to throw. */
    @Test
    public void parseParseThrowsValidates() {
        assertThrows(IllegalArgumentException.class, () -> {
            Connection.Response res = Jsoup.connect(echoUrl).execute();
            Document doc = res.parse();
            assertTrue(doc.title().contains("Environment"));
            Document doc2 = res.parse(); // should blow up because the response input stream has been drained
        });
    }

    /** Collects cookies set during a 302 redirect and replays them to the echo servlet as a map. */
    @Test
    public void multiCookieSet() throws IOException {
        Connection con = Jsoup
            .connect(RedirectServlet.Url)
            .data(RedirectServlet.CodeParam, "302")
            .data(RedirectServlet.SetCookiesParam, "true")
            .data(RedirectServlet.LocationParam, echoUrl);
        Connection.Response res = con.execute();

        // test cookies set by redirect:
        Map<String, String> cookies = res.cookies();
        assertEquals("asdfg123", cookies.get("token"));
        assertEquals("jhy", cookies.get("uid")); // two uids set, order dependent

        // send those cookies into the echo URL by map:
        Document doc = Jsoup.connect(echoUrl).cookies(cookies).get();
        assertEquals("token=asdfg123; uid=jhy", ihVal("Cookie", doc));
    }

    @Test
    public void requestCookiesSurviveRedirect() throws IOException {
        // this test makes sure that Request keyval cookies
(not in the cookie store) are sent on subsequent redirections,
        // when not using the session method
        Connection con = Jsoup.connect(RedirectServlet.Url)
            .data(RedirectServlet.LocationParam, echoUrl)
            .cookie("LetMeIn", "True")
            .cookie("DoesItWork", "Yes");

        Connection.Response res = con.execute();
        assertEquals(0, res.cookies().size()); // were not set by Redir or Echo servlet
        Document doc = res.parse();
        assertEquals(echoUrl, doc.location());
        assertEquals("True", ihVal("Cookie: LetMeIn", doc));
        assertEquals("Yes", ihVal("Cookie: DoesItWork", doc));
    }

    /** Fetches from the deflate servlet and checks the content encoding header and the decoded body. */
    @Test
    public void supportsDeflate() throws IOException {
        Connection.Response res = Jsoup.connect(DeflateServlet.Url).execute();
        assertEquals("deflate", res.header("Content-Encoding"));

        Document doc = res.parse();
        assertEquals("Hello, World!", doc.selectFirst("p").text());
    }

    /** Parses a response whose declared content length is larger than what is written; either outcome below is accepted. */
    @Test
    public void handlesLargerContentLengthParseRead() throws IOException {
        // this handles situations where the remote server sets a content length greater than it actually writes

        Connection.Response res = Jsoup.connect(InterruptedServlet.Url)
            .data(InterruptedServlet.Magnitude, InterruptedServlet.Larger)
            .timeout(400)
            .execute();

        try {
            Document document = res.parse();
            assertEquals("Something", document.title());
            assertEquals(0, document.select("p").size());
        } catch (IOException ignored) {
            // HttpUrlConnection will read the amount provided and the tests in the try will pass
            // HttpClient will throw unexpected EOF during the read, and this will catch it
            // Either are OK
        }
        // current impl, jetty won't write past content length
        // todo - find way to trick jetty into writing larger than set header. Take over the stream?
    }

    /** Buffers up and parses a response whose declared content length does not match what the servlet writes. */
    @Test
    public void handlesWrongContentLengthDuringBufferedRead() throws IOException {
        Connection.Response res = Jsoup.connect(InterruptedServlet.Url)
            .timeout(400)
            .execute();
        // this servlet writes max_buffer data, but sets content length to max_buffer/2. So will read up to that.
// previous versions of jetty would allow to write less, and would throw except here res.bufferUp(); Document doc = res.parse(); assertEquals(0, doc.select("p").size()); } @Test public void handlesRedirect() throws IOException { Document doc = Jsoup.connect(RedirectServlet.Url) .data(RedirectServlet.LocationParam, HelloServlet.Url) .get(); Element p = doc.selectFirst("p"); assertEquals("Hello, World!", p.text()); assertEquals(HelloServlet.Url, doc.location()); } @Test public void handlesEmptyRedirect() { boolean threw = false; try { Connection.Response res = Jsoup.connect(RedirectServlet.Url) .execute(); } catch (IOException e) { assertTrue(e.getMessage().contains("Too many redirects")); threw = true; } assertTrue(threw); } @Test public void doesNotPostFor302() throws IOException { final Document doc = Jsoup.connect(RedirectServlet.Url) .data("Hello", "there") .data(RedirectServlet.LocationParam, EchoServlet.Url) .post(); assertEquals(EchoServlet.Url, doc.location()); assertEquals("GET", ihVal("Method", doc)); assertNull(ihVal("Hello", doc)); // data not sent } @Test public void doesPostFor307() throws IOException { final Document doc = Jsoup.connect(RedirectServlet.Url) .data("Hello", "there") .data(RedirectServlet.LocationParam, EchoServlet.Url) .data(RedirectServlet.CodeParam, "307") .post(); assertEquals(EchoServlet.Url, doc.location()); assertEquals("POST", ihVal("Method", doc)); assertEquals("there", ihVal("Hello", doc)); } @Test public void getUtf8Bom() throws IOException { Connection con = Jsoup.connect(FileServlet.urlTo("/bomtests/bom_utf8.html")); Document doc = con.get(); assertEquals("UTF-8", con.response().charset()); assertEquals("OK", doc.title()); } @Test public void streamerGetUtf8Bom() throws IOException { Connection con = Jsoup.connect(FileServlet.urlTo("/bomtests/bom_utf8.html")); Document doc = con.execute().streamParser().complete(); assertEquals("UTF-8", con.response().charset()); assertEquals("OK", doc.title()); } @Test public void 
testBinaryContentTypeThrowsException() throws IOException { Connection con = Jsoup.connect(FileServlet.urlTo("/htmltests/thumb.jpg")); con.data(FileServlet.ContentTypeParam, "image/jpeg"); boolean threw = false; try { con.execute(); Document doc = con.response().parse(); } catch (UnsupportedMimeTypeException e) { threw = true; assertEquals("Unhandled content type. Must be text/*, */xml, or */*+xml", e.getMessage()); } assertTrue(threw); } @Test public void testParseRss() throws IOException { // test that we switch automatically to xml, and we support application/rss+xml Connection con = Jsoup.connect(FileServlet.urlTo("/htmltests/test-rss.xml")); con.data(FileServlet.ContentTypeParam, "application/rss+xml"); Document doc = con.get(); Element title = doc.selectFirst("title"); assertNotNull(title); assertEquals("jsoup RSS news", title.text()); assertEquals("channel", title.parent().nodeName()); assertEquals("", doc.title()); // the document title is unset, this tag is channel>title, not html>head>title assertEquals(3, doc.select("link").size()); assertEquals("application/rss+xml", con.response().contentType()); assertTrue(doc.parser().getTreeBuilder() instanceof XmlTreeBuilder); assertEquals(Document.OutputSettings.Syntax.xml, doc.outputSettings().syntax()); } @Test public void testSupplyParserToConnection() throws IOException { String xmlUrl = FileServlet.urlTo("/htmltests/xml-test.xml"); // parse with both xml and html parser, ensure different Document xmlDoc = Jsoup.connect(xmlUrl).parser(Parser.xmlParser()).get(); Document htmlDoc = Jsoup.connect(xmlUrl).parser(Parser.htmlParser()).get(); Document autoXmlDoc = Jsoup.connect(xmlUrl).get(); // check connection auto detects xml, uses xml parser assertEquals("<doc><val>One<val>Two</val>Three</val></doc>", TextUtil.stripNewlines(xmlDoc.html())); assertNotEquals(htmlDoc, xmlDoc); assertFalse(htmlDoc.hasSameValue(xmlDoc)); assertTrue(xmlDoc.hasSameValue(autoXmlDoc)); assertEquals(1, htmlDoc.select("head").size()); // 
html parser normalises
        assertEquals(0, xmlDoc.select("head").size()); // xml parser does not
        assertEquals(0, autoXmlDoc.select("head").size()); // xml parser does not
    }

    /** Fetches an SVG served as image/svg+xml and checks it was parsed with the XML tree builder. */
    @Test
    public void imageXmlMimeType() throws IOException {
        // test that we switch to XML, and that we support image/svg+xml
        String mimetype = "image/svg+xml";

        Connection con = Jsoup.connect(FileServlet.urlTo("/htmltests/osi-logo.svg"))
            .data(FileServlet.ContentTypeParam, mimetype);
        Document doc = con.get();

        assertEquals(mimetype, con.response().contentType());
        assertTrue(doc.parser().getTreeBuilder() instanceof XmlTreeBuilder);
        assertEquals(Document.OutputSettings.Syntax.xml, doc.outputSettings().syntax());

        // the first child is cast to an XmlDeclaration; the cast itself fails the test if it is anything else
        Node firstChild = doc.firstChild();
        XmlDeclaration decl = (XmlDeclaration) firstChild;
        assertEquals("no", decl.attr("standalone"));
        Element svg = doc.expectFirst("svg");
        Element flowRoot = svg.expectFirst("flowRoot");
        assertEquals("flowRoot", flowRoot.tagName());
        assertEquals("preserve", flowRoot.attr("xml:space"));
    }

    /** With ignoreContentType set, a binary response can be read as bytes that match the file on disk. */
    @Test
    public void canFetchBinaryAsBytes() throws IOException {
        String path = "/htmltests/thumb.jpg";
        int actualSize = 1052;

        Connection.Response res = Jsoup.connect(FileServlet.urlTo(path))
            .data(FileServlet.ContentTypeParam, "image/jpeg")
            .ignoreContentType(true)
            .execute();

        byte[] resBytes = res.bodyAsBytes();
        assertEquals(actualSize, resBytes.length);

        // compare the content of the file and the bytes:
        Path filePath = ParseTest.getPath(path);
        byte[] fileBytes = Files.readAllBytes(filePath);
        assertEquals(actualSize, fileBytes.length);
        assertArrayEquals(fileBytes, resBytes);
    }

    /** Compares the body text of a document fetched from the local server with the same file parsed from disk. */
    @Test
    public void handlesUnknownEscapesAcrossBuffer() throws IOException {
        String localPath = "/htmltests/escapes-across-buffer.html";
        String localUrl = FileServlet.urlTo(localPath);

        Document docFromLocalServer = Jsoup.connect(localUrl).get();
        Document docFromFileRead = Jsoup.parse(ParseTest.getFile(localPath), "UTF-8");

        String text = docFromLocalServer.body().text();
        assertEquals(14766, text.length());
assertEquals(text, docFromLocalServer.body().text()); assertEquals(text, docFromFileRead.body().text()); } /** * Test fetching a form, and submitting it with a file attached. */ @Test public void postHtmlFile() throws IOException { Document index = Jsoup.connect(FileServlet.urlTo("/htmltests/upload-form.html")).get(); List<FormElement> forms = index.select("[name=tidy]").forms(); assertEquals(1, forms.size()); FormElement form = forms.get(0); Connection post = form.submit(); File uploadFile = ParseTest.getFile("/htmltests/large.html"); FileInputStream stream = new FileInputStream(uploadFile); Connection.KeyVal fileData = post.data("_file"); assertNotNull(fileData); fileData.value("check.html"); fileData.inputStream(stream); Connection.Response res; try { res = post.execute(); } finally { stream.close(); } Document doc = res.parse(); assertEquals(ihVal("Method", doc), "POST"); // from form action assertEquals(ihVal("Part _file Filename", doc), "check.html"); assertEquals(ihVal("Part _file Name", doc), "_file"); assertEquals(ihVal("_function", doc), "tidy"); } @Test public void fetchHandlesXml() throws IOException { String[] types = {"text/xml", "application/xml", "application/rss+xml", "application/xhtml+xml"}; for (String type : types) { fetchHandlesXml(type); } } void fetchHandlesXml(String contentType) throws IOException { // should auto-detect xml and use XML parser, unless explicitly requested the html parser String xmlUrl = FileServlet.urlTo("/htmltests/xml-test.xml"); Connection con = Jsoup.connect(xmlUrl); con.data(FileServlet.ContentTypeParam, contentType); Document doc = con.get(); Connection.Request req = con.request(); assertTrue(req.parser().getTreeBuilder() instanceof XmlTreeBuilder); assertEquals("<doc><val>One<val>Two</val>Three</val></doc>\n", doc.outerHtml()); assertEquals(con.response().contentType(), contentType); } @Test public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException { // should auto-detect xml and use XML parser, unless 
explicitly requested the html parser
        String xmlUrl = FileServlet.urlTo("/htmltests/xml-test.xml");
        Connection con = Jsoup.connect(xmlUrl).parser(Parser.htmlParser()); // which will also use the pretty printer by default
        con.data(FileServlet.ContentTypeParam, "application/xml");
        Document doc = con.get();
        Connection.Request req = con.request();
        assertTrue(req.parser().getTreeBuilder() instanceof HtmlTreeBuilder);
        assertEquals("<html>\n <head></head>\n <body>\n <doc>\n <val>\n One<val>Two</val>Three\n </val>\n </doc>\n </body>\n</html>", doc.outerHtml());
    }

    /** A repeated response header is available both comma-joined via header() and as separate values via headers(). */
    @Test
    public void combinesSameHeadersWithComma() throws IOException {
        // http://www.w3.org/Protocols/rfc2616/rfc2616-sec4.html#sec4.2
        Connection con = Jsoup.connect(echoUrl);
        con.get();

        Connection.Response res = con.response();
        assertEquals("text/html;charset=utf-8", res.header("Content-Type"));
        assertEquals("no-cache, no-store", res.header("Cache-Control"));

        List<String> header = res.headers("Cache-Control");
        assertEquals(2, header.size());
        assertEquals("no-cache", header.get(0));
        assertEquals("no-store", header.get(1));
    }

    /** A HEAD request returns headers but an empty body, which still parses to an empty document. */
    @Test
    public void sendHeadRequest() throws IOException {
        String url = FileServlet.urlTo("/htmltests/xml-test.xml");
        Connection con = Jsoup.connect(url)
            .method(Connection.Method.HEAD)
            .data(FileServlet.ContentTypeParam, "text/xml");
        final Connection.Response response = con.execute();
        assertEquals("text/xml", response.header("Content-Type"));
        assertEquals("", response.body()); // head ought to have no body
        Document doc = response.parse();
        assertEquals("", doc.text());
    }

    /** Converts a fetched document to a W3C DOM and checks the document URI and serialized content. */
    @Test
    public void fetchToW3c() throws IOException {
        String url = FileServlet.urlTo("/htmltests/upload-form.html");
        Document doc = Jsoup.connect(url).get();

        W3CDom dom = new W3CDom();
        org.w3c.dom.Document wDoc = dom.fromJsoup(doc);
        assertEquals(url, wDoc.getDocumentURI());
        String html = dom.asString(wDoc);
        assertTrue(html.contains("Upload"));
    }

    @Test
    public void baseHrefCorrectAfterHttpEquiv() throws IOException {
        //
https://github.com/jhy/jsoup/issues/440
        Connection.Response res = Jsoup.connect(FileServlet.urlTo("/htmltests/charset-base.html")).execute();
        Document doc = res.parse();
        assertEquals("http://example.com/foo.jpg", doc.select("img").first().absUrl("src"));
    }

    /** Fetches the large test file under several maxBodySize limits and checks the parsed text length for each. */
    @Test
    public void maxBodySize() throws IOException {
        String url = FileServlet.urlTo("/htmltests/large.html"); // 280 K

        Connection.Response defaultRes = Jsoup.connect(url).execute();
        Connection.Response smallRes = Jsoup.connect(url).maxBodySize(50 * 1024).execute(); // crops
        Connection.Response mediumRes = Jsoup.connect(url).maxBodySize(200 * 1024).execute(); // crops
        Connection.Response largeRes = Jsoup.connect(url).maxBodySize(300 * 1024).execute(); // does not crop
        Connection.Response unlimitedRes = Jsoup.connect(url).maxBodySize(0).execute();

        int actualDocText = LargeDocTextLen;
        assertEquals(actualDocText, defaultRes.parse().text().length());
        assertEquals(49165, smallRes.parse().text().length());
        assertEquals(196577, mediumRes.parse().text().length());
        assertEquals(actualDocText, largeRes.parse().text().length());
        assertEquals(actualDocText, unlimitedRes.parse().text().length());
    }

    /** Calls get() twice on the same Connection and checks both documents have the expected title. */
    @Test
    public void repeatable() throws IOException {
        String url = FileServlet.urlTo("/htmltests/large.html"); // 280 K
        Connection con = Jsoup.connect(url).parser(Parser.xmlParser());
        Document doc1 = con.get();
        Document doc2 = con.get();
        assertEquals("Large HTML", doc1.title());
        assertEquals("Large HTML", doc2.title());
    }

    /** Checks body() and readBody() lengths under the same maxBodySize limits as maxBodySize(). */
    @Test
    public void maxBodySizeInReadToByteBuffer() throws IOException {
        // https://github.com/jhy/jsoup/issues/1774
        // when calling readToByteBuffer, contents were not buffered up
        String url = FileServlet.urlTo("/htmltests/large.html"); // 280 K

        Connection.Response defaultRes = Jsoup.connect(url).execute();
        Connection.Response smallRes = Jsoup.connect(url).maxBodySize(50 * 1024).execute(); // crops
        Connection.Response mediumRes = Jsoup.connect(url).maxBodySize(200 * 1024).execute(); // crops
        Connection.Response
largeRes = Jsoup.connect(url).maxBodySize(300 * 1024).execute(); // does not crop
        Connection.Response unlimitedRes = Jsoup.connect(url).maxBodySize(0).execute();

        int actualDocText = 280735;
        // body() first, then readBody(): both are expected to report the same (possibly cropped) lengths
        assertEquals(actualDocText, defaultRes.body().length());
        assertEquals(50 * 1024, smallRes.body().length());
        assertEquals(200 * 1024, mediumRes.body().length());
        assertEquals(actualDocText, largeRes.body().length());
        assertEquals(actualDocText, unlimitedRes.body().length());

        assertEquals(actualDocText, defaultRes.readBody().length());
        assertEquals(50 * 1024, smallRes.readBody().length());
        assertEquals(200 * 1024, mediumRes.readBody().length());
        assertEquals(actualDocText, largeRes.readBody().length());
        assertEquals(actualDocText, unlimitedRes.readBody().length());
    }

    /** Fills in and submits the #login form, then checks the redirect target, cookies, and follow-up requests. */
    @Test
    void formLoginFlow() throws IOException {
        String echoUrl = EchoServlet.Url;
        String cookieUrl = CookieServlet.Url;

        String startUrl = FileServlet.urlTo("/htmltests/form-tests.html");
        Document loginDoc = Jsoup.connect(startUrl).get();
        FormElement form = loginDoc.expectForm("#login");
        assertNotNull(form);
        form.expectFirst("[name=username]").val("admin");
        form.expectFirst("[name=password]").val("Netscape engineers are weenies!");

        // post it- should go to Cookie then bounce to Echo
        Connection submit = form.submit();
        assertEquals(Connection.Method.POST, submit.request().method());
        Connection.Response postRes = submit.execute();
        assertEquals(echoUrl, postRes.url().toExternalForm());
        assertEquals(Connection.Method.GET, postRes.method());
        Document resultDoc = postRes.parse();
        assertEquals("One=EchoServlet; One=Root", ihVal("Cookie", resultDoc));
        // should be no form data sent to the echo redirect
        assertEquals("", ihVal("Query String", resultDoc));

        // new request to echo, should not have form data, but should have cookies from implicit session
        Document newEcho = submit.newRequest(echoUrl).get();
        assertEquals("One=EchoServlet; One=Root", ihVal("Cookie", newEcho));
        assertEquals("", ihVal("Query String", newEcho));

        Document
cookieDoc = submit.newRequest(cookieUrl).get();
        assertEquals("CookieServlet", ihVal("One", cookieDoc)); // different cookie path
    }

    /** Same flow using an explicit session and the #login2 form, which ends on the cookie servlet via POST. */
    @Test
    void formLoginFlow2() throws IOException {
        String echoUrl = EchoServlet.Url;
        String cookieUrl = CookieServlet.Url;
        String startUrl = FileServlet.urlTo("/htmltests/form-tests.html");

        Connection session = Jsoup.newSession();
        Document loginDoc = session.newRequest(startUrl).get();
        FormElement form = loginDoc.expectForm("#login2");
        assertNotNull(form);
        String username = "admin";
        form.expectFirst("[name=username]").val(username);
        String password = "Netscape engineers are weenies!";
        form.expectFirst("[name=password]").val(password);

        Connection submit = form.submit();
        assertEquals(username, submit.data("username").value());
        assertEquals(password, submit.data("password").value());

        Connection.Response postRes = submit.execute();
        assertEquals(cookieUrl, postRes.url().toExternalForm());
        assertEquals(Connection.Method.POST, postRes.method());
        Document resultDoc = postRes.parse();

        Document echo2 = resultDoc.connection().newRequest(echoUrl).get();
        assertEquals("", ihVal("Query String", echo2)); // should not re-send the data
        assertEquals("One=EchoServlet; One=Root", ihVal("Cookie", echo2));
    }

    /** The document location should keep the URL fragment of the requested URL. */
    @Test
    void preservesUrlFragment() throws IOException {
        // confirms https://github.com/jhy/jsoup/issues/1686
        String url = EchoServlet.Url + "#fragment";
        Document doc = Jsoup.connect(url).get();
        assertEquals(url, doc.location());
    }

    /** Requests a URL with non-ASCII path and query characters, and checks how the server received them. */
    @Test
    void fetchUnicodeUrl() throws IOException {
        String url = EchoServlet.Url + "/✔/?鍵=値";
        Document doc = Jsoup.connect(url).get();

        assertEquals("/✔/", ihVal("Path Info", doc));
        assertEquals("%E9%8D%B5=%E5%80%A4", ihVal("Query String", doc));
        assertEquals("鍵=値", URLDecoder.decode(ihVal("Query String", doc), DataUtil.UTF_8.name()));
    }

    /** Redirects to a URL whose path holds characters needing escaping, and checks the resulting request URI. */
    @Test
    void willEscapePathInRedirect() throws IOException {
        String append = "/Foo{bar}<>%/";
        String url = EchoServlet.Url + append;
        Document doc = Jsoup
            .connect(RedirectServlet.Url)
.data(RedirectServlet.LocationParam, url)
            .get();

        String path = ihVal("Path Info", doc);
        assertEquals(append, path);
        assertEquals("/EchoServlet/Foo%7Bbar%7D%3C%3E%25/", ihVal("Request URI", doc));
    }

    /** Provides HTTP and HTTPS EchoServlet URLs */
    private static Stream<String> echoUrls() {
        return Stream.of(EchoServlet.Url, EchoServlet.TlsUrl);
    }

    /** A request asking for server authentication, but supplying no authenticator, gets a 401. */
    @ParameterizedTest
    @MethodSource("echoUrls")
    void failsIfNotAuthenticated(String url) throws IOException {
        String password = AuthFilter.newServerPassword(); // we don't send it, but ensures cache won't hit
        Connection.Response res = Jsoup.connect(url)
            .header(AuthFilter.WantsServerAuthentication, "1")
            .ignoreHttpErrors(true)
            .execute();

        assertEquals(401, res.statusCode());
    }

    /** Supplies correct credentials through the auth callback; the callback should be invoked exactly once. */
    @ParameterizedTest
    @MethodSource("echoUrls")
    void canAuthenticate(String url) throws IOException {
        AtomicInteger count = new AtomicInteger(0);
        String password = AuthFilter.newServerPassword();
        Connection.Response res = Jsoup.connect(url)
            .header(AuthFilter.WantsServerAuthentication, "1")
            .auth(ctx -> {
                count.incrementAndGet();
                assertEquals(Authenticator.RequestorType.SERVER, ctx.type());
                assertEquals("localhost", ctx.url().getHost());
                assertEquals(AuthFilter.ServerRealm, ctx.realm());

                return ctx.credentials(AuthFilter.ServerUser, password);
            })
            .execute();

        assertEquals(1, count.get());

        Document doc = res.parse();
        assertTrue(ihVal("Authorization", doc).startsWith("Basic ")); // tests we set the auth header
    }

    /** Checks no-auth, wrong-password, and correct-password requests made from one session. */
    @ParameterizedTest
    @MethodSource("echoUrls")
    void incorrectAuth(String url) throws IOException {
        Connection session = Jsoup.newSession()
            .header(AuthFilter.WantsServerAuthentication, "1")
            .ignoreHttpErrors(true);

        String password = AuthFilter.newServerPassword();
        int code = session.newRequest(url).execute().statusCode(); // no auth sent
        assertEquals(HttpServletResponse.SC_UNAUTHORIZED, code);

        AtomicInteger count = new AtomicInteger(0);
        try {
            Connection.Response res = session.newRequest(url)
                .auth(ctx -> {
                    count.incrementAndGet();
                    return
ctx.credentials(AuthFilter.ServerUser, password + "wrong"); // incorrect
                })
                .execute();
            assertEquals(HttpServletResponse.SC_UNAUTHORIZED, res.statusCode());
        } catch (IOException e) {
            assertEquals("No credentials provided", e.getMessage());
            // In HttpClient, will throw IOE if our password delegate stops providing credentials after too many attempts. So we'll get this error.
            // In HttpUrlConnection, which would otherwise try 20 times, when the auth stops providing it will cascade to the underlying 401 response (which seems a better path IMO)
        }
        // whichever path was taken above, the callback should have been asked MaxAttempts times
        assertEquals(MaxAttempts, count.get());

        AtomicInteger successCount = new AtomicInteger(0);
        Connection.Response successRes = session.newRequest(url)
            .auth(ctx -> {
                successCount.incrementAndGet();
                return ctx.credentials(AuthFilter.ServerUser, password); // correct
            })
            .execute();
        assertEquals(1, successCount.get());
        assertEquals(HttpServletResponse.SC_OK, successRes.statusCode());
    }

    // proxy connection tests are in ProxyTest

    /** Fetches the large test file, with and without a Content-Length header, and checks the progress callbacks. */
    @ParameterizedTest
    @ValueSource(strings = {
        "/htmltests/large.html",
        "/htmltests/large.html?" + FileServlet.SuppressContentLength
    })
    void progressListener(String path) throws IOException {
        String url = FileServlet.urlTo(path);
        boolean knownContentLength = !url.contains(FileServlet.SuppressContentLength);
        AtomicBoolean seenProgress = new AtomicBoolean(false);
        AtomicBoolean completed = new AtomicBoolean(false);
        AtomicInteger numProgress = new AtomicInteger();

        Connection con = Jsoup.connect(url).onResponseProgress((processed, total, percent, response) -> {
            //System.out.println("Processed: " + processed + " of " + total + " (" + percent + "%)");
            // the checks in this branch run on the first callback only
            if (!seenProgress.get()) {
                seenProgress.set(true);
                assertEquals(0, processed);
                assertEquals(knownContentLength ?
LargeDocFileLen : -1, total);
                assertEquals(0.0f, percent);
                assertEquals(200, response.statusCode());

                String contentLength = response.header("Content-Length");
                if (knownContentLength) {
                    assertNotNull(contentLength);
                    assertEquals(String.valueOf(LargeDocFileLen), contentLength);
                } else {
                    assertNull(contentLength);
                }
                assertEquals(url, response.url().toExternalForm());
            }
            numProgress.getAndIncrement();

            if (percent == 100.0f) {
                // even if the content-length is not set, we get 100% when the read is completed
                completed.set(true);
                assertEquals(LargeDocFileLen, processed);
            }
        });

        Document document = con.get();
        assertTrue(seenProgress.get());
        assertTrue(completed.get());

        // should expect to see events relative to how large the buffer is.
        int expected = LargeDocFileLen / 8192;

        int num = numProgress.get();
        // debug log if not in those ranges:
        if (num < expected * 0.75 || num > expected * 2.5) {
            System.err.println("Expected: " + expected + ", got: " + num);
        }
        assertTrue(num > expected * 0.75);
        assertTrue(num < expected * 2.5);

        // check the document works
        assertEquals(LargeDocTextLen, document.text().length());
    }
}

================================================
FILE: src/test/java/org/jsoup/integration/FuzzFixesIT.java
================================================
package org.jsoup.integration;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;

import java.io.File;
import java.io.IOException;
import java.util.stream.Stream;

import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 Tests fixes for issues raised by the <a href="https://oss-fuzz.com/testcases?project=jsoup">OSS Fuzz project</a>. As
 some of these are timeout tests - run each file {@code numIters} times and ensure all runs complete within
 {@code timeout} seconds.
*/ public class FuzzFixesIT { static int numIters = 50; static int timeout = 30; // external fuzzer is set to 60 for 100 runs static File testDir = ParseTest.getFile("/fuzztests/"); private static Stream<File> testFiles() { File[] files = testDir.listFiles(); assertNotNull(files); assertTrue(files.length > 10); return Stream.of(files); } @Disabled // disabled, as these soak up build time and the outcome oughtn't change unless we are refactoring the tree builders. manually execute as desired. @ParameterizedTest @MethodSource("testFiles") void testHtmlParse(File file) throws IOException { long startTime = System.currentTimeMillis(); long completeBy = startTime + timeout * 1000L; for (int i = 0; i < numIters; i++) { Document doc = Jsoup.parse(file, "UTF-8", "https://example.com/"); assertNotNull(doc); if (System.currentTimeMillis() > completeBy) Assertions.fail(String.format("Timeout: only completed %d iters of [%s] in %d seconds", i, file.getName(), timeout)); } } @Disabled // disabled, as these soak up build time and the outcome oughtn't change unless we are refactoring the tree builders. manually execute as desired. 
@ParameterizedTest @MethodSource("testFiles") void testXmlParse(File file) throws IOException { long startTime = System.currentTimeMillis(); long completeBy = startTime + timeout * 1000L; for (int i = 0; i < numIters; i++) { Document doc = Jsoup.parse(file, "UTF-8", "https://example.com/", Parser.xmlParser()); assertNotNull(doc); if (System.currentTimeMillis() > completeBy) Assertions.fail(String.format("Timeout: only completed %d iters of [%s] in %d seconds", i, file.getName(), timeout)); } } } ================================================ FILE: src/test/java/org/jsoup/integration/FuzzFixesTest.java ================================================ package org.jsoup.integration; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.parser.Parser; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.MethodSource; import java.io.File; import java.io.IOException; import java.util.stream.Stream; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; /** Tests fixes for issues raised by the OSS Fuzz project @ https://oss-fuzz.com/testcases?project=jsoup. Contains inline string cases causing exceptions. Timeout tests are in FuzzFixesIT. 
*/
public class FuzzFixesTest {
    /** Parameter source: streams the files listed in {@code FuzzFixesIT.testDir}, after checking that more than 10 are present. */
    private static Stream<File> testFiles() {
        File[] files = FuzzFixesIT.testDir.listFiles();
        assertNotNull(files);
        assertTrue(files.length > 10);
        return Stream.of(files);
    }

    @Test
    public void blankAbsAttr() {
        // https://github.com/jhy/jsoup/issues/1541
        String html = "b<bodY abs: abs:abs: abs:abs:abs>";
        Document doc = Jsoup.parse(html);
        assertNotNull(doc);
    }

    @Test
    public void bookmark() {
        // https://github.com/jhy/jsoup/issues/1576
        String html = "<?a<U<P<A ";
        Document doc = Jsoup.parse(html);
        assertNotNull(doc);

        Document xmlDoc = Parser.xmlParser().parseInput(html, "");
        assertNotNull(xmlDoc);
    }

    @Test
    void fragment() {
        // no assertion on the result: the test passes if the fragment parse completes without throwing
        Parser.htmlParser().parseFragmentInput("<frameset>>l\u0000<\u0000<ditl>\u0000< \\", new Element("colgroup"), "");
    }

    /** Parses each fuzz file twice via {@code Jsoup.parse} (no explicit parser): with a base URI, and with an empty one. */
    @ParameterizedTest
    @MethodSource("testFiles")
    void testHtmlParse(File file) throws IOException {
        Document doc = Jsoup.parse(file, "UTF-8", "https://example.com/");
        assertNotNull(doc);

        doc = Jsoup.parse(file, "UTF-8", ""); // no base href attr; so same as a parse(string), which can have subtly different semantics
        assertNotNull(doc);
    }

    /** Reads each fuzz file into a String and parses it as a body fragment. */
    @ParameterizedTest
    @MethodSource("testFiles")
    void testHtmlFragmentParse(File file) throws IOException {
        String html = ParseTest.getFileAsString(file);
        Document doc = Jsoup.parseBodyFragment(html);
        assertNotNull(doc);
    }

    /** Parses each fuzz file twice with the XML parser: with a base URI, and with an empty one. */
    @ParameterizedTest
    @MethodSource("testFiles")
    void testXmlParse(File file) throws IOException {
        Document doc = Jsoup.parse(file, "UTF-8", "https://example.com/", Parser.xmlParser());
        assertNotNull(doc);

        doc = Jsoup.parse(file, "UTF-8", "", Parser.xmlParser()); // no base href attr
        assertNotNull(doc); // the second parse is checked too, matching testHtmlParse
    }
}

================================================
FILE: src/test/java/org/jsoup/integration/ParseTest.java
================================================
package org.jsoup.integration;

import org.jsoup.Jsoup;
import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.ParseErrorList;
import org.jsoup.parser.Parser;
import
org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;

import java.io.*;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.zip.GZIPInputStream;

import static org.junit.jupiter.api.Assertions.*;

/**
 * Integration test: parses from real-world example HTML.
 *
 * @author Jonathan Hedley, jonathan@hedley.net
 */
public class ParseTest {
    @Test
    public void testHtml5Charset() throws IOException {
        // test that <meta charset="gb2312"> works
        File in = getFile("/htmltests/meta-charset-1.html");
        Document doc = Jsoup.parse(in, null, "http://example.com/"); //gb2312, has html5 <meta charset>
        assertEquals("新", doc.text());
        assertEquals("GB2312", doc.outputSettings().charset().displayName());

        // double check, no charset, falls back to utf8 which is incorrect
        in = getFile("/htmltests/meta-charset-2.html");
        doc = Jsoup.parse(in, null, "http://example.com"); // gb2312, no charset
        assertEquals("UTF-8", doc.outputSettings().charset().displayName());
        assertNotEquals("新", doc.text());

        // confirm fallback to utf8
        in = getFile("/htmltests/meta-charset-3.html");
        doc = Jsoup.parse(in, null, "http://example.com/"); // utf8, no charset
        assertEquals("UTF-8", doc.outputSettings().charset().displayName());
        assertEquals("新", doc.text());
    }

    @Test
    public void testBrokenHtml5CharsetWithASingleDoubleQuote() throws IOException {
        InputStream in = inputStreamFrom("<html>\n" +
            "<head><meta charset=UTF-8\"></head>\n" +
            "<body></body>\n" +
            "</html>");
        Document doc = Jsoup.parse(in, null, "http://example.com/");
        assertEquals("UTF-8", doc.outputSettings().charset().displayName());
    }

    @Test
    public void testLowercaseUtf8Charset() throws IOException {
        File in = getFile("/htmltests/lowercase-charset-test.html");
        Document doc = Jsoup.parse(in, null);

        Element form = doc.select("#form").first();
        assertEquals(2, form.children().size());
        assertEquals("UTF-8", doc.outputSettings().charset().name());
    }

    @Test
    public void testXwiki() throws IOException {
        // https://github.com/jhy/jsoup/issues/1324
        // this tests that when in CharacterReader we hit a buffer while marked, we preserve the mark when buffered up and can rewind
        File in = getFile("/htmltests/xwiki-1324.html.gz");
        Document doc = Jsoup.parse(in, null, "https://localhost/");
        assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());

        // was getting busted at =userdirectory, because it hit the bufferup point but the mark was then lost. so
        // updated to preserve the mark.
        String wantHtml = "<a class=\"list-group-item\" data-id=\"userdirectory\" href=\"/xwiki/bin/admin/XWiki/XWikiPreferences?editor=globaladmin&section=userdirectory\" title=\"Customize the user directory live table.\">User Directory</a>";
        assertEquals(wantHtml, doc.select("[data-id=userdirectory]").outerHtml());
    }

    @Test
    public void testXwikiExpanded() throws IOException {
        // https://github.com/jhy/jsoup/issues/1324
        // this tests that if there is a huge illegal character reference, we can get through a buffer and rewind, and still catch that it's an invalid reference,
        // and the parse tree is correct.
        File in = getFile("/htmltests/xwiki-edit.html.gz");
        Parser parser = Parser.htmlParser();
        Document doc = Jsoup.parse(new GZIPInputStream(new FileInputStream(in)), "UTF-8", "https://localhost/", parser.setTrackErrors(100));
        ParseErrorList errors = parser.getErrors();

        assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
        assertEquals(0, errors.size()); // not an invalid reference because did not look legit

        // was getting busted at =userdirectory, because it hit the bufferup point but the mark was then lost. so
        // updated to preserve the mark.
        String wantHtml = "<a class=\"list-group-item\" data-id=\"userdirectory\" href=\"/xwiki/bin/admin/XWiki/XWikiPreferences?editor=globaladmin&RIGHTHERERIGHTHERERIGHTHERERIGHTHERE";
        assertTrue(doc.select("[data-id=userdirectory]").outerHtml().startsWith(wantHtml));
    }

    @Test
    public void testWikiExpandedFromString() throws IOException {
        File in = getFile("/htmltests/xwiki-edit.html.gz");
        String html = getFileAsString(in);
        Document doc = Jsoup.parse(html);
        assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
        String wantHtml = "<a class=\"list-group-item\" data-id=\"userdirectory\" href=\"/xwiki/bin/admin/XWiki/XWikiPreferences?editor=globaladmin&RIGHTHERERIGHTHERERIGHTHERERIGHTHERE";
        assertTrue(doc.select("[data-id=userdirectory]").outerHtml().startsWith(wantHtml));
    }

    @Test
    public void testWikiFromString() throws IOException {
        File in = getFile("/htmltests/xwiki-1324.html.gz");
        String html = getFileAsString(in);
        Document doc = Jsoup.parse(html);
        assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
        String wantHtml = "<a class=\"list-group-item\" data-id=\"userdirectory\" href=\"/xwiki/bin/admin/XWiki/XWikiPreferences?editor=globaladmin&section=userdirectory\" title=\"Customize the user directory live table.\">User Directory</a>";
        assertEquals(wantHtml, doc.select("[data-id=userdirectory]").outerHtml());
    }

    @Test
    public void testFileParseNoCharsetMethod() throws IOException {
        File in = getFile("/htmltests/xwiki-1324.html.gz");
        Document doc = Jsoup.parse(in);
        assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
    }

    /**
     Resolves a classpath resource (looked up relative to this class) to a File.
     @param resourceName resource name, e.g. {@code /htmltests/medium.html}
     @return the resource as a File, or a File pointing at {@code /404} when the resource is not found
     @throws IllegalStateException if the resource URL cannot be converted to a URI
     */
    public static File getFile(String resourceName) {
        try {
            URL resource = ParseTest.class.getResource(resourceName);
            return resource != null ? new File(resource.toURI()) : new File("/404");
        } catch (URISyntaxException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     Resolves a classpath resource (looked up relative to this class) to a Path.
     @param resourceName resource name, e.g. {@code /htmltests/medium.html}
     @return the resource as a Path, or the path {@code /404} when the resource is not found
     @throws IllegalStateException if the resource URL cannot be converted to a URI
     */
    public static Path getPath(String resourceName) {
        try {
            URL resource = ParseTest.class.getResource(resourceName);
            return resource != null ? Paths.get(resource.toURI()) : Paths.get("/404");
        } catch (URISyntaxException e) {
            throw new IllegalStateException(e);
        }
    }

    /** Wraps the UTF-8 bytes of the supplied String in an in-memory InputStream. */
    public static InputStream inputStreamFrom(String s) {
        return new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8));
    }

    /**
     Reads the whole file into a String, decoding the bytes as UTF-8. Files whose name ends in {@code .gz} are
     decompressed first.
     @param file the file to read
     @return the (decompressed, if gzipped) file content
     @throws IOException if the file cannot be read
     */
    public static String getFileAsString(File file) throws IOException {
        byte[] bytes;
        if (file.getName().endsWith(".gz")) {
            // try-with-resources, so both the file stream and the gzip wrapper are released even if the read fails
            try (InputStream fileIn = new FileInputStream(file);
                 InputStream stream = new GZIPInputStream(fileIn)) {
                ByteBuffer byteBuffer = DataUtil.readToByteBuffer(stream, 0); // NOTE(review): 0 presumably means no size cap - confirm against DataUtil
                bytes = new byte[byteBuffer.limit()];
                System.arraycopy(byteBuffer.array(), 0, bytes, 0, byteBuffer.limit());
            }
        } else {
            bytes = Files.readAllBytes(file.toPath());
        }
        // explicit charset: new String(byte[]) would use the platform default, which varies between machines
        return new String(bytes, StandardCharsets.UTF_8);
    }
}

================================================
FILE: src/test/java/org/jsoup/integration/ProxyTest.java
================================================
package org.jsoup.integration;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.integration.servlets.AuthFilter;
import org.jsoup.integration.servlets.EchoServlet;
import org.jsoup.integration.servlets.FileServlet;
import org.jsoup.integration.servlets.HelloServlet;
import org.jsoup.integration.servlets.ProxyServlet;
import org.jsoup.integration.servlets.RedirectServlet;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;

import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;

import static org.jsoup.helper.AuthenticationHandlerTest.MaxAttempts;
import static
org.jsoup.integration.ConnectTest.ihVal;
import static org.junit.jupiter.api.Assertions.*;

/**
 Tests Jsoup.connect proxy support
 */
public class ProxyTest {
    private static String echoUrl;
    private static TestServer.ProxySettings proxy;

    @BeforeAll
    public static void setUp() {
        echoUrl = EchoServlet.Url;
        proxy = ProxyServlet.ProxySettings;
    }

    @ParameterizedTest
    @MethodSource("helloUrls")
    void fetchViaProxy(String url) throws IOException {
        Connection con = Jsoup.connect(url)
            .proxy(proxy.hostname, proxy.port);

        Connection.Response res = con.execute();
        if (url.startsWith("http:/")) assertVia(res); // HTTPS CONNECT won't have Via

        Document doc = res.parse();
        Element p = doc.expectFirst("p");
        assertEquals("Hello, World!", p.text());
    }

    /** Parameter source: the plain and TLS URLs of the hello servlet. */
    private static Stream<String> helloUrls() {
        return Stream.of(HelloServlet.Url, HelloServlet.TlsUrl);
    }

    /** Parameter source: the plain and TLS URLs of the echo servlet. */
    private static Stream<String> echoUrls() {
        return Stream.of(EchoServlet.Url, EchoServlet.TlsUrl);
    }

    /** Asserts that the response carries the Via header value that the test proxy is expected to add. */
    private static void assertVia(Connection.Response res) {
        // expected value first, so that a failure message labels the two sides correctly
        assertEquals(ProxyServlet.Via, res.header("Via"));
    }

    @Test
    void redirectViaProxy() throws IOException {
        Connection.Response res = Jsoup
            .connect(RedirectServlet.Url)
            .data(RedirectServlet.LocationParam, echoUrl)
            .header("Random-Header-name", "hello")
            .proxy(proxy.hostname, proxy.port)
            .execute();

        assertVia(res);
        Document doc = res.parse();
        assertEquals(echoUrl, doc.location());
        assertEquals("hello", ihVal("Random-Header-name", doc));
        assertVia(res);
    }

    @Test
    void proxyForSession() throws IOException {
        Connection session = Jsoup.newSession().proxy(proxy.hostname, proxy.port);

        Connection.Response medRes = session.newRequest(FileServlet.urlTo("/htmltests/medium.html")).execute();
        Connection.Response largeRes = session.newRequest(FileServlet.urlTo("/htmltests/large.html")).execute();

        assertVia(medRes);
        assertVia(largeRes);
        assertEquals("Medium HTML", medRes.parse().title());
        assertEquals("Large HTML", largeRes.parse().title());

        Connection.Response smedRes = session.newRequest(FileServlet.tlsUrlTo("/htmltests/medium.html")).execute();
        Connection.Response slargeRes = session.newRequest(FileServlet.tlsUrlTo("/htmltests/large.html")).execute();

        assertEquals("Medium HTML", smedRes.parse().title());
        assertEquals("Large HTML", slargeRes.parse().title());
    }

    @ParameterizedTest
    @MethodSource("echoUrls")
    void canAuthenticateToProxy(String url) throws IOException {
        TestServer.closeAuthedProxyConnections(); // reset any existing authed connections from previous tests, so we can test the auth flow

        // the proxy wants auth, but not the server. HTTP and HTTPS, so tests direct proxy and CONNECT
        Connection session = Jsoup.newSession()
            .proxy(proxy.hostname, proxy.authedPort)
            .ignoreHttpErrors(true)
            .ignoreContentType(true); // ignore content type, as error served may not have a content type
        String password = AuthFilter.newProxyPassword();

        // fail first
        try {
            Connection.Response execute = session.newRequest(url)
                .execute();
            int code = execute.statusCode(); // no auth sent
            assertEquals(HttpServletResponse.SC_PROXY_AUTHENTICATION_REQUIRED, code);
        } catch (IOException e) {
            assertAuthRequiredException(e);
        }

        try {
            AtomicInteger count = new AtomicInteger(0);
            Connection.Response res = session.newRequest(url)
                .auth(ctx -> {
                    count.incrementAndGet();
                    return ctx.credentials(AuthFilter.ProxyUser, password + "wrong"); // incorrect
                })
                .execute();
            assertEquals(MaxAttempts, count.get());
            assertEquals(HttpServletResponse.SC_PROXY_AUTHENTICATION_REQUIRED, res.statusCode());
        } catch (IOException e) {
            assertAuthRequiredException(e);
        }

        AtomicInteger successCount = new AtomicInteger(0);
        Connection.Response successRes = session.newRequest(url)
            .auth(ctx -> {
                successCount.incrementAndGet();
                return ctx.credentials(AuthFilter.ProxyUser, password); // correct
            })
            .execute();
        assertEquals(1, successCount.get());
        assertEquals(HttpServletResponse.SC_OK, successRes.statusCode());
    }

    /**
     Asserts that the exception looks like a "407 Proxy Authentication Required" failure, judged by its message text.
     Fails the test (after dumping the exception to stderr) if the message is missing or does not match.
     */
    static void assertAuthRequiredException(IOException e) {
        // in CONNECT (for the HTTPS url), URLConnection will throw the proxy connect as a Stringly typed IO exception - "Unable to tunnel through proxy. Proxy returns "HTTP/1.1 407 Proxy Authentication Required"". (Not a response code)
        // Alternatively, some platforms (?) will report: "No credentials provided"
        String err = e.getMessage();
        // a null message is treated as "not a 407", so we get a clean test failure rather than a NullPointerException
        boolean looksLike407 = err != null
            && (err.contains("407") || err.contains("No credentials provided") || err.contains("exch.exchImpl"));
        if (!looksLike407) {
            // https://github.com/jhy/jsoup/pull/2403 - Ubuntu Azul 25 throws `Cannot invoke "jdk.internal.net.http.ExchangeImpl.cancel(java.io.IOException)" because "exch.exchImpl" is null` here but is just from cancelling the 407 req
            System.err.println("Not a 407 exception? " + e.getClass());
            e.printStackTrace(System.err);
            fail("Expected 407 Proxy Authentication Required, got: " + err);
        }
    }

    @ParameterizedTest
    @MethodSource("echoUrls")
    void canAuthToProxyAndServer(String url) throws IOException {
        String serverPassword = AuthFilter.newServerPassword();
        String proxyPassword = AuthFilter.newProxyPassword();
        AtomicInteger count = new AtomicInteger(0);

        Connection session = Jsoup.newSession() // both proxy and server will want auth
            .proxy(proxy.hostname, proxy.authedPort)
            .header(AuthFilter.WantsServerAuthentication, "1")
            .auth(auth -> {
                count.incrementAndGet();

                if (auth.isServer()) {
                    assertEquals(url, auth.url().toString());
                    assertEquals(AuthFilter.ServerRealm, auth.realm());
                    return auth.credentials(AuthFilter.ServerUser, serverPassword);
                } else {
                    assertTrue(auth.isProxy());
                    return auth.credentials(AuthFilter.ProxyUser, proxyPassword);
                }
            });

        Connection.Response res = session.newRequest(url).execute();
        assertEquals(200, res.statusCode());
        assertEquals(2, count.get()); // hit server and proxy auth stages
        assertEquals("Webserver Environment Variables", res.parse().title());
    }
}

================================================
FILE: src/test/java/org/jsoup/integration/SafelistExtensionTest.java
================================================
package
org.jsoup.integration; import org.jsoup.Jsoup; import org.jsoup.TextUtil; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Element; import org.jsoup.safety.Safelist; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; /** Check that we can extend Safelist methods */ public class SafelistExtensionTest { @Test public void canCustomizeSafeTests() { OpenSafelist openSafelist = new OpenSafelist(Safelist.relaxed()); Safelist safelist = Safelist.relaxed(); String html = "<p><opentag openattr>Hello</opentag></p>"; String openClean = Jsoup.clean(html, openSafelist); String clean = Jsoup.clean(html, safelist); assertEquals("<p><opentag openattr=\"\">Hello</opentag></p>", TextUtil.stripNewlines(openClean)); assertEquals("<p>Hello</p>", clean); } // passes tags and attributes starting with "open" private static class OpenSafelist extends Safelist { public OpenSafelist(Safelist safelist) { super(safelist); } @Override public boolean isSafeAttribute(String tagName, Element el, Attribute attr) { if (attr.getKey().startsWith("open")) return true; return super.isSafeAttribute(tagName, el, attr); } @Override public boolean isSafeTag(String tag) { if (tag.startsWith("open")) return true; return super.isSafeTag(tag); } } } ================================================ FILE: src/test/java/org/jsoup/integration/SessionIT.java ================================================ package org.jsoup.integration; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.integration.servlets.FileServlet; import org.jsoup.integration.servlets.SlowRider; import org.jsoup.nodes.Document; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import java.io.IOException; import java.io.UncheckedIOException; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import static org.junit.jupiter.api.Assertions.*; /** Integration tests to test longer 
running Connection */
public class SessionIT {
    @BeforeAll
    public static void setUp() {
        TestServer.start();
    }

    /** Runs many threads against one shared session, each request made via newRequest(), and expects no failures. */
    @Test
    public void multiThread() throws InterruptedException {
        int numThreads = 20;
        int numThreadLoops = 5;
        String[] urls = {
            FileServlet.urlTo("/htmltests/medium.html"),
            FileServlet.urlTo("/htmltests/upload-form.html"),
            FileServlet.urlTo("/htmltests/comments.html"),
            FileServlet.urlTo("/htmltests/large.html"),
        };
        String[] titles = {
            "Medium HTML",
            "Upload Form Test",
            "A Certain Kind of Test",
            "Large HTML"
        };
        ThreadCatcher catcher = new ThreadCatcher();

        Connection session = Jsoup.newSession();
        Thread[] threads = new Thread[numThreads];
        for (int threadNum = 0; threadNum < numThreads; threadNum++) {
            Thread thread = new Thread(() -> {
                for (int loop = 0; loop < numThreadLoops; loop++) {
                    for (int i = 0; i < urls.length; i++) {
                        try {
                            Document doc = session.newRequest().url(urls[i]).get();
                            assertEquals(titles[i], doc.title());
                        } catch (IOException e) {
                            throw new UncheckedIOException(e);
                        }
                    }
                }
            });
            thread.setName("Runner-" + threadNum);
            // install the handler before starting the thread, so that an early failure cannot slip past the catcher
            thread.setUncaughtExceptionHandler(catcher);
            thread.start();
            threads[threadNum] = thread;
        }

        // now join them all
        for (Thread thread : threads) {
            thread.join();
        }

        assertEquals(0, catcher.exceptionCount.get());
    }

    // test that we throw a nice clear exception if you try to multi-thread but forget .newRequest()
    @Test
    public void multiThreadWithoutNewRequestBlowsUp() throws InterruptedException {
        int numThreads = 5;
        String url = SlowRider.Url + "?" + SlowRider.MaxTimeParam + "=20000"; // this makes sure that the first req is still executing whilst the others run
        String title = "Slow Rider";

        ThreadCatcher catcher = new ThreadCatcher();
        Connection session = Jsoup.newSession();

        // run first slow request
        AtomicInteger successful = new AtomicInteger();
        Thread slow = new Thread(() -> {
            try {
                Document doc = session.url(url).get();
                assertNotNull(doc);
            } catch (IOException e) {
                if (!isInterruptedException(e))
                    throw new UncheckedIOException(e);
            }
        });
        slow.start();
        Thread.sleep(100); // yield so that thread can start before the next do

        // spawn others, should fail
        Thread[] threads = new Thread[numThreads];
        for (int threadNum = 0; threadNum < numThreads; threadNum++) {
            Thread thread = new Thread(() -> {
                try {
                    Document doc = session.url(url).get();
                    assertEquals(title, doc.title());
                    successful.getAndIncrement();
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
            });
            thread.setName("Runner-" + threadNum);
            thread.setUncaughtExceptionHandler(catcher);
            thread.start();
            threads[threadNum] = thread;
        }

        // now join them all
        for (Thread thread : threads) {
            thread.join();
        }

        // cancel the slow runner so we can wrap up the test quicker
        slow.interrupt();

        // only one should have passed, rest should have blown up (assuming they started whilst other was running)
        //assertEquals(numThreads - 1, catcher.multiThreadExceptions.get());
        //assertEquals(numThreads - 1, catcher.exceptionCount.get());

        /* The checks above work when all the threads are executed within the time window. However, depending on how
        cloudy it is when the CI jobs are run, they may not all complete in time. As of writing that appears most
        commonly on the macOS runners, which appear overtaxed. That makes this test flaky. So we relax the test
        conditions, and check that none of the spawned requests succeeded, that at least one was rejected as
        multi-threaded, and that there were no other kinds of failure. That's OK in prod as well, because we are only
        concerned about concurrent execution, which the impl does detect correctly. */
        assertEquals(0, successful.get());
        assertTrue(catcher.multiThreadExceptions.get() > 0);
        assertEquals(catcher.multiThreadExceptions.get(), catcher.exceptionCount.get()); // all exceptions are multi-threaded
    }

    @Test
    public void multiThreadWithProgressListener() throws InterruptedException {
        // tests that we can use one progress listener for multiple URLs and threads.
        int numThreads = 10;
        String[] urls = {
            FileServlet.urlTo("/htmltests/medium.html"),
            FileServlet.urlTo("/htmltests/upload-form.html"),
            FileServlet.urlTo("/htmltests/comments.html"),
            FileServlet.urlTo("/htmltests/large.html"),
        };
        Set<String> seenUrls = ConcurrentHashMap.newKeySet();
        AtomicInteger completedCount = new AtomicInteger(0);
        ThreadCatcher catcher = new ThreadCatcher();

        Connection session = Jsoup.newSession()
            .onResponseProgress((processed, total, percent, response) -> {
                if (percent == 100.0f) {
                    //System.out.println("Completed " + Thread.currentThread().getName() + "- " + response.url());
                    seenUrls.add(response.url().toExternalForm());
                    completedCount.incrementAndGet();
                }
            });

        Thread[] threads = new Thread[numThreads];
        for (int threadNum = 0; threadNum < numThreads; threadNum++) {
            Thread thread = new Thread(() -> {
                for (String url : urls) {
                    try {
                        Connection con = session.newRequest().url(url);
                        con.get();
                    } catch (IOException e) {
                        throw new UncheckedIOException(e);
                    }
                }
            });
            thread.setName("Runner-" + threadNum);
            thread.setUncaughtExceptionHandler(catcher);
            thread.start();
            threads[threadNum] = thread;
        }

        // now join them all
        for (Thread thread : threads) {
            thread.join();
        }

        assertEquals(0, catcher.exceptionCount.get());
        assertEquals(urls.length, seenUrls.size());
        assertEquals(urls.length * numThreads, completedCount.get());
    }

    /**
     Counts the uncaught exceptions thrown by test threads. Exceptions whose message contains "Multiple threads" are
     counted separately as well; any other exception that is not caused by an interrupt has its stack trace printed.
     */
    static class ThreadCatcher implements Thread.UncaughtExceptionHandler {
        AtomicInteger exceptionCount = new AtomicInteger();
        AtomicInteger multiThreadExceptions = new AtomicInteger();

        @Override
        public void uncaughtException(Thread t, Throwable e) {
            if (e.getMessage() != null && e.getMessage().contains("Multiple threads")) {
                multiThreadExceptions.incrementAndGet();
            } else if (!isInterruptedException(e)) {
                e.printStackTrace();
            }
            exceptionCount.incrementAndGet();
        }
    }

    /** Tests if the throwable is, or has anywhere in its cause chain, an InterruptedException. */
    private static boolean isInterruptedException(Throwable e) {
        Throwable cause = e;
        while (cause != null) {
            if (cause instanceof InterruptedException) return true;
            cause = cause.getCause();
        }
        return false;
    }
}

================================================
FILE: src/test/java/org/jsoup/integration/SessionTest.java
================================================
package org.jsoup.integration;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.integration.servlets.CookieServlet;
import org.jsoup.integration.servlets.EchoServlet;
import org.jsoup.integration.servlets.FileServlet;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.parser.TagSet;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.lang.reflect.Field;
import java.util.Map;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

public class SessionTest {
    @BeforeAll
    public static void setUp() {
        TestServer.start();
    }

    /** Selects the td cells that directly follow a th containing the key. */
    private static Elements keyEls(String key, Document doc) {
        return doc.select("th:contains(" + key + ") + td");
    }

    /** Gets the text of the first td cell that directly follows a th containing the key. */
    private static String keyText(String key, Document doc) {
        return doc.selectFirst("th:contains(" + key + ") + td").text();
    }

    @Test
    public void testPathScopedCookies() throws IOException {
        final Connection session = Jsoup.newSession();
        final String userAgent = "Jsoup Testalot v0.1";

        session.userAgent(userAgent);
        session.url(CookieServlet.Url);

        // should have no cookies:
        Connection con1 = session.newRequest();
        Document doc1 = con1.get();
        assertEquals(0, doc1.select("table tr").size()); // none sent to servlet

        // set the cookies
        Connection con2 = session.newRequest().data(CookieServlet.SetCookiesParam, "1");
        Document doc2 = con2.get();
        assertEquals(0, doc2.select("table tr").size()); // none sent to servlet - we just got them!
        Map<String, String> cookies = con2.response().cookies(); // simple cookie response, all named "One", so should be last sent
        assertEquals(2, cookies.size());
        assertEquals("EchoServlet", cookies.get("One"));
        // test that all response cookies are present, even if would not be sent for this request path (i.e. cookies() res can be set on a req, without using sessions)
        assertEquals("Override", cookies.get("Two"));

        // todo - interrogate cookie-store

        // check that they are sent and filtered to the right path
        Connection con3 = session.newRequest();
        Document doc3 = con3.get();
        assertCookieServlet(doc3);

        Document echo = session.newRequest().url(EchoServlet.Url).get();
        assertEchoServlet(echo);
        assertEquals(userAgent, keyText("User-Agent", echo)); // check that customer user agent sent on session arrived

        // check that cookies aren't set out of the session
        Document doc4 = Jsoup.newSession().url(CookieServlet.Url).get();
        assertEquals(0, doc4.select("table tr").size()); // none sent to servlet

        // check can add local ones also
        Document doc5 = session.newRequest().cookie("Bar", "Qux").get();
        Elements doc5Bar = keyEls("Bar", doc5);
        assertEquals("Qux", doc5Bar.first().text());
    }

    // validate that only cookies set by cookie servlet get to the cookie servlet path
    private void assertCookieServlet(Document doc) {
        assertEquals(2, doc.select("table tr").size()); // two of three sent to servlet (/ and /CookieServlet)
        Elements doc3Els = keyEls("One", doc);
        assertEquals(2, doc3Els.size());
        assertEquals("CookieServlet", doc3Els.get(0).text()); // ordered by most specific path
        assertEquals("Root", doc3Els.get(1).text()); // ordered by most specific path
    }

    // validate that only for echo servlet
    private void assertEchoServlet(Document doc) {
        Elements echoEls = keyEls("Cookie: One", doc); // two of three sent to servlet (/ and /EchoServlet)
        assertEquals(2, echoEls.size());
        assertEquals("EchoServlet", echoEls.get(0).text()); // ordered by most specific path - /Echo
        assertEquals("Root", echoEls.get(1).text()); // ordered by most specific path - /
    }

    @Test
    public void testPathScopedCookiesOnRedirect() throws IOException {
        Connection session = Jsoup.newSession();

        Document doc1 = session.newRequest()
            .url(CookieServlet.Url)
            .data(CookieServlet.LocationParam, EchoServlet.Url)
            .data(CookieServlet.SetCookiesParam, "1")
            .get();

        // we should be redirected to the echo servlet with cookies
        assertEquals(EchoServlet.Url, doc1.location());
        assertEchoServlet(doc1); // checks we only have /echo cookies

        Document doc2 = session.newRequest()
            .url(EchoServlet.Url)
            .get();
        assertEchoServlet(doc2); // test retained in session

        Document doc3 = session.newRequest()
            .url(CookieServlet.Url)
            .get();
        assertCookieServlet(doc3); // and so were the /cookie cookies
    }

    @Test
    public void testCanChangeParsers() throws IOException {
        Connection session = Jsoup.newSession().parser(Parser.xmlParser());

        String xmlUrl = FileServlet.urlTo("/htmltests/xml-test.xml");
        String xmlVal = "<doc><val>One<val>Two</val>Three</val></doc>\n";

        Document doc1 = session.newRequest().url(xmlUrl).get();
        assertEquals(xmlVal, doc1.html()); // not HTML normed, used XML parser

        Document doc2 = session.newRequest().parser(Parser.htmlParser()).url(xmlUrl).get();
        assertTrue(doc2.html().startsWith("<html>"));

        Document doc3 = session.newRequest().url(xmlUrl).get();
        assertEquals(xmlVal, doc3.html()); // did not blow away xml default
    }

    @Test
    public void sessionTagSetDoesNotMutateRoot() {
        Connection session = Jsoup.newSession();
        TagSet rootTags = session.request().parser().tagSet();
        int rootNamespacesBefore = tagSetNamespaceCount(rootTags);

        Connection request = session.newRequest();
        Parser parser = request.request().parser();
        parser.parseInput("<custom>One <b>Two</b></custom>", "http://example.com/");

        int rootNamespacesAfter = tagSetNamespaceCount(rootTags);
        assertEquals(rootNamespacesBefore, rootNamespacesAfter);
    }

    @Test
    public void sessionTagSetCustomizerDoesNotMutateRoot() {
        Connection session = Jsoup.newSession();
        TagSet rootTags = session.request().parser().tagSet();
        rootTags.onNewTag(tag -> {
            if (!tag.isKnownTag()) tag.set(Tag.RcData);
        });
        int rootNamespacesBefore = tagSetNamespaceCount(rootTags);

        Connection request = session.newRequest();
        Parser parser = request.request().parser();
        Document doc = parser.parseInput("<custom>One <b>Two</b></custom>", "https://example.com/");
        assertEquals(0, doc.select("custom b").size());

        int rootNamespacesAfter = tagSetNamespaceCount(rootTags);
        assertEquals(rootNamespacesBefore, rootNamespacesAfter);
    }

    /** Reads, via reflection, the size of the private {@code tags} map held by the TagSet. */
    private static int tagSetNamespaceCount(TagSet tagSet) {
        try {
            Field tagsField = TagSet.class.getDeclaredField("tags");
            tagsField.setAccessible(true);
            Map<?, ?> tags = (Map<?, ?>) tagsField.get(tagSet);
            return tags.size();
        } catch (ReflectiveOperationException e) {
            throw new RuntimeException(e);
        }
    }
}

================================================
FILE: src/test/java/org/jsoup/integration/TestServer.java
================================================
package org.jsoup.integration;

import org.eclipse.jetty.http.HttpVersion;
import org.eclipse.jetty.server.Connector;
import org.eclipse.jetty.server.HttpConfiguration;
import org.eclipse.jetty.server.HttpConnectionFactory;
import org.eclipse.jetty.server.SecureRequestCustomizer;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.server.ServerConnector;
import org.eclipse.jetty.server.SslConnectionFactory;
import org.eclipse.jetty.server.handler.HandlerWrapper;
import org.eclipse.jetty.servlet.FilterHolder;
import org.eclipse.jetty.servlet.FilterMapping;
import org.eclipse.jetty.servlet.ServletHandler;
import org.eclipse.jetty.util.log.Log;
import org.eclipse.jetty.util.log.StdErrLog;
import org.eclipse.jetty.util.ssl.SslContextFactory;
import org.jsoup.integration.servlets.AuthFilter;
import org.jsoup.integration.servlets.BaseServlet; import org.jsoup.integration.servlets.ProxyServlet; import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.SSLContext; import javax.net.ssl.SSLSocketFactory; import javax.net.ssl.TrustManager; import javax.net.ssl.TrustManagerFactory; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.net.InetSocketAddress; import java.nio.file.Files; import java.security.KeyManagementException; import java.security.KeyStore; import java.security.KeyStoreException; import java.security.NoSuchAlgorithmException; import java.security.cert.CertificateException; import java.util.concurrent.atomic.AtomicInteger; public class TestServer { static int Port; static int TlsPort; private static final String Localhost = "localhost"; private static final String KeystorePassword = "hunter2"; private static final Server Jetty = newServer(); private static final ServletHandler JettyHandler = new ServletHandler(); private static final Server Proxy = newServer(); private static final Server AuthedProxy = newServer(); private static final HandlerWrapper ProxyHandler = new HandlerWrapper(); private static final HandlerWrapper AuthedProxyHandler = new HandlerWrapper(); private static final ProxySettings ProxySettings = new ProxySettings(); private static Server newServer() { // logs to stdout, so not highlighted as errors in Maven test runs StdErrLog logger = new StdErrLog(); logger.setStdErrStream(System.out); Log.setLog(logger); return new Server(new InetSocketAddress(Localhost, 0)); } static { Jetty.setHandler(JettyHandler); Proxy.setHandler(ProxyHandler); AuthedProxy.setHandler(AuthedProxyHandler); // TLS setup: try { File keystoreFile = ParseTest.getFile("/local-cert/server.pfx"); if (!keystoreFile.exists()) throw new FileNotFoundException(keystoreFile.toString()); addHttpsConnector(keystoreFile, Jetty); setupDefaultTrust(keystoreFile); } catch (Exception e) { throw new 
IllegalStateException(e); } } private TestServer() { } public static void start() { synchronized (Jetty) { if (Jetty.isStarted()) return; try { Jetty.start(); JettyHandler.addFilterWithMapping(new FilterHolder(new AuthFilter(false, false)), "/*", FilterMapping.ALL); Connector[] jcons = Jetty.getConnectors(); Port = ((ServerConnector) jcons[0]).getLocalPort(); TlsPort = ((ServerConnector) jcons[1]).getLocalPort(); ProxyHandler.setHandler(ProxyServlet.createHandler(false)); // includes proxy, CONNECT proxy, and Auth filters Proxy.start(); ProxySettings.port = ((ServerConnector) Proxy.getConnectors()[0]).getLocalPort(); AuthedProxyHandler.setHandler(ProxyServlet.createHandler(true)); AuthedProxy.start(); ProxySettings.authedPort = ((ServerConnector) AuthedProxy.getConnectors()[0]).getLocalPort(); } catch (Exception e) { throw new IllegalStateException(e); } } } /** Close any current connections to the authed proxy. Tunneled connections only authenticate in their first CONNECT, and may be kept alive and reused. So when we want to test unauthed - authed flows, we need to disconnect them first. */ static int closeAuthedProxyConnections() { ServerConnector connector = (ServerConnector) AuthedProxy.getConnectors()[0]; AtomicInteger count = new AtomicInteger(); connector.getConnectedEndPoints().forEach(endPoint -> { endPoint.close(); count.getAndIncrement(); }); return count.get(); } public static ServletUrls map(Class<? 
extends BaseServlet> servletClass) { synchronized (Jetty) { if (!Jetty.isStarted()) start(); // if running out of the test cases String path = "/" + servletClass.getSimpleName(); JettyHandler.addServletWithMapping(servletClass, path + "/*"); String url = "http://" + Localhost + ":" + Port + path; String tlsUrl = "https://" + Localhost + ":" + TlsPort + path; return new ServletUrls(url, tlsUrl); } } public static class ServletUrls { public final String url; public final String tlsUrl; public ServletUrls(String url, String tlsUrl) { this.url = url; this.tlsUrl = tlsUrl; } } public static ProxySettings proxySettings() { synchronized (Jetty) { if (!Jetty.isStarted()) start(); return ProxySettings; } } //public static String proxy public static class ProxySettings { final String hostname = Localhost; int port; int authedPort; } private static void addHttpsConnector(File keystoreFile, Server server) { // Cribbed from https://github.com/jetty/jetty.project/blob/jetty-9.4.x/examples/embedded/src/main/java/org/eclipse/jetty/embedded/LikeJettyXml.java SslContextFactory sslContextFactory = new SslContextFactory.Server(); String path = keystoreFile.getAbsolutePath(); sslContextFactory.setKeyStorePath(path); sslContextFactory.setKeyStorePassword(KeystorePassword); sslContextFactory.setKeyManagerPassword(KeystorePassword); sslContextFactory.setTrustStorePath(path); sslContextFactory.setTrustStorePassword(KeystorePassword); HttpConfiguration httpConfig = new HttpConfiguration(); httpConfig.setSecureScheme("https"); HttpConfiguration httpsConfig = new HttpConfiguration(httpConfig); httpsConfig.addCustomizer(new SecureRequestCustomizer()); ServerConnector sslConnector = new ServerConnector( server, new SslConnectionFactory(sslContextFactory, HttpVersion.HTTP_1_1.asString()), new HttpConnectionFactory(httpsConfig)); sslConnector.setHost(Localhost); server.addConnector(sslConnector); } private static void setupDefaultTrust(File keystoreFile) throws KeyStoreException, IOException, 
NoSuchAlgorithmException, CertificateException, KeyManagementException { // Configure HttpsUrlConnection (jsoup) to trust (only) this cert KeyStore trustStore = KeyStore.getInstance(KeyStore.getDefaultType()); trustStore.load(Files.newInputStream(keystoreFile.toPath()), KeystorePassword.toCharArray()); TrustManagerFactory trustManagerFactory = TrustManagerFactory.getInstance(TrustManagerFactory.getDefaultAlgorithm()); trustManagerFactory.init(trustStore); TrustManager[] managers = trustManagerFactory.getTrustManagers(); SSLContext tls = SSLContext.getInstance("TLS"); tls.init(null, managers, null); SSLContext.setDefault(tls); } } ================================================ FILE: src/test/java/org/jsoup/integration/servlets/AuthFilter.java ================================================ package org.jsoup.integration.servlets; import javax.servlet.Filter; import javax.servlet.FilterChain; import javax.servlet.FilterConfig; import javax.servlet.ServletException; import javax.servlet.ServletRequest; import javax.servlet.ServletResponse; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.Base64; /** A filter to test basic authenticated requests. If the request header "X-Wants-Authentication" is set, or if alwaysWantsAuth is enabled, the filter is invoked, and requests must send the correct user authentication details. 
*/ public class AuthFilter implements Filter { public static final String WantsServerAuthentication = "X-Wants-ServerAuthentication"; public static final String ServerUser = "admin"; public static final String ServerRealm = "jsoup test server authentication realm"; private static volatile String ServerPassword = newServerPassword(); public static final String WantsProxyAuthentication = "X-Wants-ProxyAuthentication"; public static final String ProxyUser = "foxyproxy"; public static final String ProxyRealm = "jsoup test proxy authentication realm"; private static volatile String ProxyPassword = newProxyPassword(); private final boolean alwaysWantsAuth; // we run a particular port that always wants auth - so the CONNECT tunnels can be authed. (The Java proxy tunnel CONNECT request strips the wants-auth headers) private final boolean forProxy; private final String wantsHeader; private final String authorizationHeader; /** Creates an Authentication Filter with hardcoded credential expectations. * @param alwaysWantsAuth true if this filter should always check for authentication, regardless of the Wants Auth header * @param forProxy true if this wraps a Proxy and should use Proxy-Authenticate headers, credentials etc. False * if wrapping the web server. */ public AuthFilter(boolean alwaysWantsAuth, boolean forProxy) { this.alwaysWantsAuth = alwaysWantsAuth; this.forProxy = forProxy; wantsHeader = forProxy ? WantsProxyAuthentication : WantsServerAuthentication; authorizationHeader = forProxy ? "Proxy-Authorization" : "Authorization"; } private static String newPassword() { return "pass-" + Math.random(); } // passwords get rotated in tests so that Java's auth cache is invalidated and a new auth callback occurs. // requires tests hitting these are called serially. 
public static String newServerPassword() { return ServerPassword = newPassword() + "-server"; } public static String newProxyPassword() { return ProxyPassword = newPassword() + "-proxy"; } @Override public void init(FilterConfig filterConfig) throws ServletException {} @Override public void doFilter(ServletRequest request, ServletResponse response, FilterChain chain) throws IOException, ServletException { HttpServletRequest req = (HttpServletRequest) request; HttpServletResponse res = (HttpServletResponse) response; boolean accessGranted = checkAuth(req); if (accessGranted) { chain.doFilter(request, response); return; } // Wants but failed auth - send appropriate header: if (forProxy) { res.setHeader("Proxy-Authenticate", "Basic realm=\"" + ProxyRealm + "\""); // ^^ Duped in ProxyServlet for CONNECT res.sendError(HttpServletResponse.SC_PROXY_AUTHENTICATION_REQUIRED); } else { res.setHeader("WWW-Authenticate", "Basic realm=\"" + ServerRealm + "\""); res.sendError(HttpServletResponse.SC_UNAUTHORIZED); } } @Override public void destroy() {} public boolean checkAuth(HttpServletRequest req) { if (alwaysWantsAuth || req.getHeader(wantsHeader) != null) { String authHeader = req.getHeader(authorizationHeader); if (authHeader != null) { int space = authHeader.indexOf(' '); if (space > 0) { String value = authHeader.substring(space + 1); String expected = forProxy ? 
(ProxyUser + ":" + ProxyPassword) : (ServerUser + ":" + ServerPassword); String base64 = Base64.getEncoder().encodeToString(expected.getBytes(StandardCharsets.UTF_8)); return base64.equals(value); // if passed auth } } return false; // unexpected header value } return true; // auth not required } } ================================================ FILE: src/test/java/org/jsoup/integration/servlets/BaseServlet.java ================================================ package org.jsoup.integration.servlets; import javax.servlet.ServletException; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import java.io.IOException; public abstract class BaseServlet extends HttpServlet { static final String TextHtml = "text/html; charset=UTF-8"; abstract protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOException, ServletException; @Override protected void doGet(HttpServletRequest req, HttpServletResponse res) throws IOException, ServletException { doIt(req, res); } @Override protected void doPost(HttpServletRequest req, HttpServletResponse res) throws IOException, ServletException { doIt(req, res); } @Override protected void doPut(HttpServletRequest req, HttpServletResponse res) throws IOException, ServletException { doIt(req, res); } @Override protected void doDelete(HttpServletRequest req, HttpServletResponse res) throws IOException, ServletException { doIt(req, res); } } ================================================ FILE: src/test/java/org/jsoup/integration/servlets/CookieServlet.java ================================================ package org.jsoup.integration.servlets; import org.jsoup.integration.TestServer; import javax.servlet.http.Cookie; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import java.io.IOException; import java.io.PrintWriter; public class CookieServlet extends BaseServlet { public static final String 
Url; public static final String TlsUrl; static { TestServer.ServletUrls urls = TestServer.map(CookieServlet.class); Url = urls.url; TlsUrl = urls.tlsUrl; } public static final String SetCookiesParam = "setCookies"; public static final String LocationParam = "loc"; @Override protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOException { // Do we want to set cookies? if (req.getParameter(SetCookiesParam) != null) setCookies(res); // Do we want to redirect elsewhere? String loc = req.getParameter(LocationParam); if (loc != null) { res.sendRedirect(loc); return; } // print out the cookies that were received res.setContentType(TextHtml); res.setStatus(200); PrintWriter w = res.getWriter(); w.println("<table>"); final Cookie[] cookies = req.getCookies(); if (cookies != null) { for (Cookie cookie : cookies) { EchoServlet.write(w, cookie.getName(), cookie.getValue()); } } w.println("</table>"); } private void setCookies(HttpServletResponse res) { Cookie one = new Cookie("One", "Root"); one.setPath("/"); res.addCookie(one); Cookie two = new Cookie("One", "CookieServlet"); two.setPath("/CookieServlet"); two.setHttpOnly(true); two.setComment("Quite nice"); res.addCookie(two); Cookie three = new Cookie("One", "EchoServlet"); three.setPath("/EchoServlet"); res.addCookie(three); Cookie four = new Cookie("Two", "NoSuchPath"); four.setPath("/bogus"); res.addCookie(four); Cookie five = new Cookie("Two", "Override"); five.setPath("/bogus"); res.addCookie(five); } } ================================================ FILE: src/test/java/org/jsoup/integration/servlets/DeflateServlet.java ================================================ package org.jsoup.integration.servlets; import org.jsoup.integration.TestServer; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.zip.Deflater; import java.util.zip.DeflaterOutputStream; public class 
DeflateServlet extends BaseServlet { public static final String Url; public static final String TlsUrl; static { TestServer.ServletUrls urls = TestServer.map(DeflateServlet.class); Url = urls.url; TlsUrl = urls.tlsUrl; } @Override protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOException { res.setContentType(TextHtml); res.setStatus(HttpServletResponse.SC_OK); res.setHeader("Content-Encoding", "deflate"); String doc = "<p>Hello, World!<p>That should be enough, right?<p>Hello, World!<p>That should be enough, right?"; DeflaterOutputStream stream = new DeflaterOutputStream( res.getOutputStream(), new Deflater(Deflater.BEST_COMPRESSION, true)); // true = nowrap zlib headers stream.write(doc.getBytes(StandardCharsets.UTF_8)); stream.close(); } // allow the servlet to run as a main program, for local test public static void main(String[] args) { TestServer.start(); System.out.println(Url); } } ================================================ FILE: src/test/java/org/jsoup/integration/servlets/EchoServlet.java ================================================ package org.jsoup.integration.servlets; import org.eclipse.jetty.server.Request; import org.jsoup.helper.DataUtil; import org.jsoup.internal.StringUtil; import org.jsoup.integration.TestServer; import javax.servlet.MultipartConfigElement; import javax.servlet.ServletException; import javax.servlet.http.Cookie; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import javax.servlet.http.Part; import java.io.IOException; import java.io.PrintWriter; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.Collection; import java.util.Enumeration; import static org.jsoup.nodes.Entities.escape; public class EchoServlet extends BaseServlet { public static final String CodeParam = "code"; private static final int DefaultCode = HttpServletResponse.SC_OK; public static final String Url; public static final String TlsUrl; static 
{ TestServer.ServletUrls urls = TestServer.map(EchoServlet.class); Url = urls.url; TlsUrl = urls.tlsUrl; } @Override protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOException, ServletException { int intCode = DefaultCode; String code = req.getHeader(CodeParam); if (code != null) intCode = Integer.parseInt(code); boolean isMulti = maybeEnableMultipart(req); res.setContentType(TextHtml); res.setStatus(intCode); // no-cache headers for test res.addHeader("Cache-Control", "no-cache"); res.addHeader("Cache-Control", "no-store"); PrintWriter w = res.getWriter(); w.write("<title>Webserver Environment Variables\n" + " \n" + " \n" + " "); // some get items write(w, "Method", req.getMethod()); write(w, "Request URI", req.getRequestURI()); write(w, "Path Info", req.getPathInfo()); write(w, "Query String", req.getQueryString()); // request headers (why is it an enumeration?) Enumeration headerNames = req.getHeaderNames(); while (headerNames.hasMoreElements()) { String header = headerNames.nextElement(); Enumeration headers = req.getHeaders(header); while (headers.hasMoreElements()) { write(w, header, headers.nextElement()); } } // cookies final Cookie[] cookies = req.getCookies(); if (cookies != null) { for (Cookie cookie : cookies) { EchoServlet.write(w, "Cookie: " + cookie.getName(), cookie.getValue()); } } // the request params Enumeration parameterNames = req.getParameterNames(); while (parameterNames.hasMoreElements()) { String name = parameterNames.nextElement(); String[] values = req.getParameterValues(name); write(w, name, StringUtil.join(values, ", ")); } // post body ByteBuffer byteBuffer = DataUtil.readToByteBuffer(req.getInputStream(), 0); String postData = new String(byteBuffer.array(), byteBuffer.arrayOffset(), byteBuffer.limit(), StandardCharsets.UTF_8); if (!StringUtil.isBlank(postData)) { write(w, "Post Data", postData); } // file uploads if (isMulti) { Collection parts = req.getParts(); write(w, "Parts", 
String.valueOf(parts.size())); for (Part part : parts) { String name = part.getName(); write(w, "Part " + name + " ContentType", part.getContentType()); write(w, "Part " + name + " Name", name); write(w, "Part " + name + " Filename", part.getSubmittedFileName()); write(w, "Part " + name + " Size", String.valueOf(part.getSize())); part.delete(); } } w.println("
"); } static void write(PrintWriter w, String key, String val) { w.println("" + escape(key) + "" + escape(val) + ""); } // allow the servlet to run as a main program, for local test public static void main(String[] args) { TestServer.start(); System.out.println("Listening on " + Url + " and " + TlsUrl); } private static boolean maybeEnableMultipart(HttpServletRequest req) { boolean isMulti = req.getContentType() != null && req.getContentType().startsWith("multipart/form-data"); if (isMulti) { req.setAttribute(Request.MULTIPART_CONFIG_ELEMENT, new MultipartConfigElement( System.getProperty("java.io.tmpdir"))); } return isMulti; } } ================================================ FILE: src/test/java/org/jsoup/integration/servlets/FileServlet.java ================================================ package org.jsoup.integration.servlets; import org.jsoup.integration.ParseTest; import org.jsoup.integration.TestServer; import javax.servlet.ServletOutputStream; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import java.io.File; import java.io.IOException; import java.nio.file.Files; public class FileServlet extends BaseServlet { public static final String Url; public static final String TlsUrl; static { TestServer.ServletUrls urls = TestServer.map(FileServlet.class); Url = urls.url; TlsUrl = urls.tlsUrl; } public static final String ContentTypeParam = "contentType"; public static final String HtmlType = "text/html"; static final String XmlType = "text/xml"; public static final String SuppressContentLength = "surpriseMe"; @Override protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOException { String contentType = req.getParameter(ContentTypeParam); if (contentType == null) { contentType = HtmlType; if (req.getPathInfo().contains(".xml")) contentType = XmlType; } String location = req.getPathInfo(); File file = ParseTest.getFile(location); if (file.exists()) { res.setContentType(contentType); if 
(file.getName().endsWith("gz")) res.addHeader("Content-Encoding", "gzip"); if (req.getParameter(SuppressContentLength) == null) res.setContentLength((int) file.length()); res.setStatus(HttpServletResponse.SC_OK); ServletOutputStream out = res.getOutputStream(); Files.copy(file.toPath(), out); out.flush(); } else { res.sendError(HttpServletResponse.SC_NOT_FOUND); } } public static String urlTo(String path) { return Url + path; } public static String tlsUrlTo(String path) { return TlsUrl + path; } } ================================================ FILE: src/test/java/org/jsoup/integration/servlets/HelloServlet.java ================================================ package org.jsoup.integration.servlets; import org.jsoup.integration.TestServer; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import java.io.IOException; public class HelloServlet extends BaseServlet { public static final String Url; public static final String TlsUrl; static { TestServer.ServletUrls urls = TestServer.map(HelloServlet.class); Url = urls.url; TlsUrl = urls.tlsUrl; } @Override protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOException { res.setContentType(TextHtml); res.setStatus(HttpServletResponse.SC_OK); String doc = "

Hello, World!"; res.getWriter().write(doc); } } ================================================ FILE: src/test/java/org/jsoup/integration/servlets/InterruptedServlet.java ================================================ package org.jsoup.integration.servlets; import org.jsoup.integration.TestServer; import org.jsoup.parser.CharacterReaderTest; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import java.io.IOException; public class InterruptedServlet extends BaseServlet { public static final String Url; public static final String TlsUrl; static { TestServer.ServletUrls urls = TestServer.map(InterruptedServlet.class); Url = urls.url; TlsUrl = urls.tlsUrl; } public static final String Magnitude = "magnitude"; public static final String Larger = "larger"; @Override protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOException { String magnitude = req.getParameter(Magnitude); magnitude = magnitude == null ? "" : magnitude; res.setContentType(TextHtml); res.setStatus(HttpServletResponse.SC_OK); StringBuilder sb = new StringBuilder(); sb.append("Something"); while (sb.length() <= 32 * 1024) { sb.append("

A suitable amount of data.
\n"); } sb.append("

Finale.

"); String data = sb.toString(); int contentLength = magnitude.equals(Larger) ? data.length() * 2 : data.length() / 2; res.setContentLength(contentLength); res.getWriter().write(data); } } ================================================ FILE: src/test/java/org/jsoup/integration/servlets/ProxyServlet.java ================================================ package org.jsoup.integration.servlets; import org.eclipse.jetty.client.api.Response; import org.eclipse.jetty.proxy.AsyncProxyServlet; import org.eclipse.jetty.proxy.ConnectHandler; import org.eclipse.jetty.server.Handler; import org.eclipse.jetty.servlet.FilterHolder; import org.eclipse.jetty.servlet.FilterMapping; import org.eclipse.jetty.servlet.ServletHandler; import org.eclipse.jetty.servlet.ServletHolder; import org.jsoup.integration.TestServer; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import static org.jsoup.integration.servlets.AuthFilter.ProxyRealm; public class ProxyServlet extends AsyncProxyServlet { public static TestServer.ProxySettings ProxySettings = TestServer.proxySettings(); public static String Via = "1.1 jsoup test proxy"; static { System.setProperty("jdk.http.auth.tunneling.disabledSchemes", ""); // removes Basic, which is otherwise excluded from auth for CONNECT tunnels } public static Handler createHandler(boolean alwaysAuth) { // ConnectHandler wraps this ProxyServlet and handles CONNECT, which sets up a tunnel for HTTPS requests and is // opaque to the proxy. The ProxyServlet handles simple HTTP requests. 
AuthFilter authFilter = new AuthFilter(alwaysAuth, true);
        // the same filter instance backs both the CONNECT handler and the plain HTTP proxy filter below
        ConnectHandler connectHandler = new ConnectProxy(authFilter);
        ServletHandler proxyHandler = new ServletHandler();
        proxyHandler.addFilterWithMapping(new FilterHolder(authFilter), "/*", FilterMapping.ALL); // auth for HTTP proxy
        ServletHolder proxyServletHolder = new ServletHolder(ProxyServlet.class); // Holder wraps as it requires maxThreads initialization
        proxyServletHolder.setAsyncSupported(true);
        proxyServletHolder.setInitParameter("maxThreads", "200");
        proxyHandler.addServletWithMapping(proxyServletHolder, "/*");
        connectHandler.setHandler(proxyHandler);

        return connectHandler;
    }

    /** Adds the test proxy's {@code Via} header to the proxied response, after the default header handling. */
    @Override
    protected void onServerResponseHeaders(HttpServletRequest clientRequest, HttpServletResponse proxyResponse, Response serverResponse) {
        super.onServerResponseHeaders(clientRequest, proxyResponse, serverResponse);
        proxyResponse.addHeader("Via", Via);
    }

    /** Supports CONNECT tunnels */
    static class ConnectProxy extends ConnectHandler {
        final AuthFilter authFilter;

        public ConnectProxy(AuthFilter authFilter) {
            this.authFilter = authFilter;
        }

        /**
         Delegates the CONNECT auth decision to the AuthFilter, adding the Basic challenge header when refused.
         @return true if the tunnel request is authorized
         */
        @Override
        protected boolean handleAuthentication(HttpServletRequest req, HttpServletResponse res, String address) {
            boolean accessGranted = authFilter.checkAuth(req);
            //System.err.println("CONNECT AUTH: " + accessGranted);

            // need to add the desired auth header if not granted. Returning false here will also send 407 header
            if (!accessGranted) {
                res.setHeader("Proxy-Authenticate", "Basic realm=\"" + ProxyRealm + "\"");
            }
            return accessGranted;
        }
    }
}

================================================
FILE: src/test/java/org/jsoup/integration/servlets/RedirectServlet.java
================================================
package org.jsoup.integration.servlets;

import org.jsoup.integration.TestServer;

import javax.servlet.http.Cookie;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException;

/**
 Test servlet that responds with a redirect. The target comes from the {@code loc} param (empty if absent), the
 status from the {@code code} param (defaulting to SC_MOVED_TEMPORARILY), and cookies are added when the
 {@code setCookies} param is present.
 */
public class RedirectServlet extends BaseServlet {
    public static final String Url;
    public static final String TlsUrl;
    static {
        TestServer.ServletUrls urls = TestServer.map(RedirectServlet.class);
        Url = urls.url;
        TlsUrl = urls.tlsUrl;
    }
    public static final String LocationParam = "loc";
    public static final String CodeParam = "code";
    public static final String SetCookiesParam = "setCookies";
    private static final int DefaultCode = HttpServletResponse.SC_MOVED_TEMPORARILY;

    /** Writes the Location header and status code; a non-numeric {@code code} param fails in Integer.parseInt. */
    @Override
    protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOException {
        String location = req.getParameter(LocationParam);
        if (location == null)
            location = "";

        int intCode = DefaultCode;
        String code = req.getParameter(CodeParam);
        if (code != null)
            intCode = Integer.parseInt(code);

        if (req.getParameter(SetCookiesParam) != null) {
            res.addCookie(new Cookie("token", "asdfg123"));
            res.addCookie(new Cookie("uid", "foobar"));
            res.addCookie(new Cookie("uid", "jhy")); // dupe, should use latter
        }

        res.setHeader("Location", location);
        res.setStatus(intCode);
        // send the headers now; no body is written
        res.flushBuffer();
    }
}

================================================
FILE: src/test/java/org/jsoup/integration/servlets/SlowRider.java
================================================
package org.jsoup.integration.servlets;

import org.jsoup.integration.TestServer;

import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException; import java.io.PrintWriter; /** * Slowly, interminably writes output. For the purposes of testing timeouts and interrupts. */ public class SlowRider extends BaseServlet { public static final String Url; public static final String TlsUrl; static { TestServer.ServletUrls urls = TestServer.map(SlowRider.class); Url = urls.url; TlsUrl = urls.tlsUrl; } private static final int SleepTime = 2000; public static final String MaxTimeParam = "maxTime"; public static final String IntroSizeParam = "introSize"; @Override protected void doIt(HttpServletRequest req, HttpServletResponse res) throws IOException { pause(1000); res.setContentType(TextHtml); res.setStatus(HttpServletResponse.SC_OK); PrintWriter w = res.getWriter(); int maxTime = -1; String maxTimeP = req.getParameter(MaxTimeParam); if (maxTimeP != null) { maxTime = Integer.parseInt(maxTimeP); } int introSize = 0; String introSizeP = req.getParameter(IntroSizeParam); if (introSizeP != null) { introSize = Integer.parseInt(introSizeP); } long startTime = System.currentTimeMillis(); w.println("Slow Rider"); // write out a bunch of stuff at the start before interim pauses, gets past some buffers if (introSize != 0) { StringBuilder s = new StringBuilder(); while (s.length() < introSize) { s.append("

Hello and welcome to the Slow Rider!

\n"); } w.println(s); w.flush(); } while (true) { w.println("

Are you still there?"); boolean err = w.checkError(); // flush, and check still ok if (err) { log("Remote connection lost"); break; } if (pause(SleepTime)) break; if (maxTime > 0 && System.currentTimeMillis() > startTime + maxTime) { w.println("

outatime

"); break; } } } private static boolean pause(int sleepTime) { try { Thread.sleep(sleepTime); } catch (InterruptedException e) { return true; } return false; } // allow the servlet to run as a main program, for local test public static void main(String[] args) { TestServer.start(); System.out.println(Url); } } ================================================ FILE: src/test/java/org/jsoup/internal/ControllableInputStreamTest.java ================================================ package org.jsoup.internal; import org.junit.jupiter.api.Test; import java.io.ByteArrayInputStream; import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; import static org.junit.jupiter.api.Assertions.*; class ControllableInputStreamTest { @Test void respectsMaxCapDuringFill() throws IOException { byte[] data = "0123456789".getBytes(); // 10 bytes CountingInputStream counting = new CountingInputStream(new ByteArrayInputStream(data)); ControllableInputStream in = ControllableInputStream.wrap(counting, 5); // cap at 5 bytes byte[] buf = new byte[10]; int read = in.read(buf); assertEquals(5, read, "should only read up to cap"); assertEquals(5, counting.count, "underlying stream should not be pulled past cap"); assertFalse(in.baseReadFully(), "cap hit is not EOF"); int second = in.read(buf); assertEquals(-1, second, "further reads return -1 once cap is exhausted"); assertFalse(in.baseReadFully(), "still not true EOF"); in.close(); } @Test void compactsBufferWithActiveMark() throws IOException { int size = SharedConstants.DefaultBufferSize * 2; byte[] data = new byte[size]; for (int i = 0; i < size; i++) data[i] = (byte) (i % 256); ControllableInputStream in = ControllableInputStream.wrap(new ByteArrayInputStream(data), 0); byte[] first = new byte[500]; assertEquals(500, in.read(first)); in.mark(SharedConstants.DefaultBufferSize); // mark at logical pos 500 byte[] consume = new byte[SharedConstants.DefaultBufferSize]; int firstRead = in.read(consume); // serves 
remainder of current buffer (BufferSize - 500) assertEquals(SharedConstants.DefaultBufferSize - 500, firstRead); byte[] more = new byte[1000]; int secondRead = in.read(more); // triggers fill() with active mark, then consumes from freshly filled buffer assertEquals(SharedConstants.DefaultBufferSize - firstRead, secondRead); in.reset(); // should rewind to mark despite prior compaction byte[] reread = new byte[1000]; assertEquals(1000, in.read(reread)); for (int i = 0; i < reread.length; i++) { assertEquals(data[500 + i], reread[i], "byte mismatch at " + i); } in.close(); } private static final class CountingInputStream extends FilterInputStream { int count = 0; CountingInputStream(InputStream in) { super(in); } @Override public int read(byte[] b, int off, int len) throws IOException { int r = super.read(b, off, len); if (r > 0) count += r; return r; } @Override public int read() throws IOException { int r = super.read(); if (r != -1) count++; return r; } } } ================================================ FILE: src/test/java/org/jsoup/internal/QuietAppendableTest.java ================================================ package org.jsoup.internal; import org.jsoup.Jsoup; import org.jsoup.SerializationException; import org.jsoup.nodes.Document; import org.junit.jupiter.api.Test; import java.io.CharArrayWriter; import java.io.IOException; import static org.junit.jupiter.api.Assertions.*; class QuietAppendableTest { @Test void wrap() { assertInstanceOf( QuietAppendable.StringBuilderAppendable.class, QuietAppendable.wrap(new StringBuilder()) ); assertInstanceOf( QuietAppendable.BaseAppendable.class, QuietAppendable.wrap(new CharArrayWriter()) ); } @Test void supplemental() { // hits append(char[] chars, int offset, int len) with len 2 for supplemental codepoint String expect = "😀"; char[] chars = new char[2]; chars[0] = expect.charAt(0); chars[1] = expect.charAt(1); assertEquals(2, expect.length()); QuietAppendable sb = QuietAppendable.wrap(new StringBuilder()); 
sb.append(chars, 0, 2); String s = sb.toString(); assertEquals(expect, s); CharArrayWriter cw = new CharArrayWriter(); QuietAppendable qa = QuietAppendable.wrap(cw); qa.append(chars, 0, 2); String out = cw.toString(); assertEquals(expect, out); } private static Appendable brokenAppender() { // returns an Appendable that throws an IOException on any put return new Appendable() { @Override public Appendable append(CharSequence csq) throws IOException { throw new IOException("broken"); } @Override public Appendable append(CharSequence csq, int start, int end) throws IOException { throw new IOException("broken"); } @Override public Appendable append(char c) throws IOException { throw new IOException("broken"); } }; } @Test void appendThrowsSerializationException() { Document doc = Jsoup.parse("
"); Appendable brokenWriter = brokenAppender(); boolean threw = false; try { doc.html(brokenWriter); } catch (SerializationException e) { threw = true; Throwable cause = e.getCause(); assertEquals("broken", cause.getMessage()); assertInstanceOf(IOException.class, cause); } assertTrue(threw); } } ================================================ FILE: src/test/java/org/jsoup/internal/ReaderTest.java ================================================ package org.jsoup.internal; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.parser.CharacterReader; import org.jsoup.parser.Parser; import org.junit.jupiter.api.Test; import java.io.FileInputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import static org.jsoup.integration.ParseTest.getPath; import static org.junit.jupiter.api.Assertions.assertEquals; public class ReaderTest { @Test void readerOfStringAndFile() throws IOException { // make sure that reading from a String and from a File produce the same bytes Path path = getPath("/fuzztests/garble.html"); byte[] bytes = Files.readAllBytes(path); String fromBytes = new String(bytes, StandardCharsets.UTF_8); SimpleStreamReader streamReader = getReader(path); String fromStream = getString(streamReader); assertEquals(fromBytes, fromStream); SimpleStreamReader reader2 = getReader(path); CharacterReader cr = new CharacterReader(reader2); String fullRead = cr.consumeTo('X'); // does not exist in input assertEquals(fromBytes, fullRead); } private static String getString(SimpleStreamReader streamReader) throws IOException { // read streamreader to a string: StringBuilder builder = new StringBuilder(); char[] cbuffer = new char[1024]; int read; while ((read = streamReader.read(cbuffer)) != -1) { builder.append(cbuffer, 0, read); } return builder.toString(); } private static SimpleStreamReader getReader(Path path) throws IOException { // set up a chain as in when we parse: 
simplebufferedinput -> controllableinputstream -> simplestreamreader -> characterreader SimpleBufferedInput input = new SimpleBufferedInput(Files.newInputStream(path)); ControllableInputStream stream = ControllableInputStream.wrap(input, 0); return new SimpleStreamReader(stream, StandardCharsets.UTF_8); } } ================================================ FILE: src/test/java/org/jsoup/internal/SoftPoolTest.java ================================================ package org.jsoup.internal; import org.junit.jupiter.api.Test; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.atomic.AtomicInteger; import static org.junit.jupiter.api.Assertions.*; public class SoftPoolTest { private static final int BufSize = 12; private static final int NumThreads = 5; private static final int NumObjects = 3; @Test public void testSoftLocalPool() throws InterruptedException { SoftPool softLocalPool = new SoftPool<>(() -> new char[BufSize]); ExecutorService executorService = Executors.newFixedThreadPool(NumThreads); CountDownLatch latch = new CountDownLatch(NumThreads); Set allBuffers = new HashSet<>(); Set[] threadLocalBuffers = new Set[NumThreads]; for (int i = 0; i < NumThreads; i++) { threadLocalBuffers[i] = new HashSet<>(); } AtomicInteger threadCount = new AtomicInteger(); Runnable task = () -> { try { int threadIndex = threadCount.getAndIncrement(); Set localBuffers = new HashSet<>(); // First borrow for (int i = 0; i < NumObjects; i++) { char[] buffer = softLocalPool.borrow(); assertEquals(BufSize, buffer.length); localBuffers.add(buffer); } // Release buffers back to the pool for (char[] buffer : localBuffers) { softLocalPool.release(buffer); } // Borrow again and ensure buffers are reused for (int i = 0; i < NumObjects; i++) { char[] buffer = 
softLocalPool.borrow(); assertTrue(localBuffers.contains(buffer), "Buffer was not reused in the same thread"); threadLocalBuffers[threadIndex].add(buffer); } synchronized (allBuffers) { allBuffers.addAll(threadLocalBuffers[threadIndex]); } } finally { latch.countDown(); } }; // Run the tasks for (int i = 0; i < NumThreads; i++) { executorService.submit(task::run); } // Wait for all threads to complete latch.await(); executorService.shutdown(); // Ensure no buffers are shared between threads Set uniqueBuffers = new HashSet<>(); for (Set bufferSet : threadLocalBuffers) { for (char[] buffer : bufferSet) { assertTrue(uniqueBuffers.add(buffer), "Buffer was shared between threads"); } } } @Test public void testSoftReferenceBehavior() { SoftPool softLocalPool = new SoftPool<>(() -> new char[BufSize]); // Borrow and release an object char[] buffer = softLocalPool.borrow(); assertEquals(BufSize, buffer.length); softLocalPool.release(buffer); // Fake a GC softLocalPool.threadLocalStack.get().clear(); // Ensure the object is garbage collected assertNull(softLocalPool.threadLocalStack.get().get()); char[] second = softLocalPool.borrow(); // should be different, but same size assertNotEquals(buffer, second); assertEquals(BufSize, second.length); } @Test public void testBorrowFromEmptyPool() { SoftPool softLocalPool = new SoftPool<>(() -> new char[BufSize]); // Borrow from an empty pool char[] buffer = softLocalPool.borrow(); assertNotNull(buffer, "Borrowed null from an empty pool"); assertEquals(BufSize, buffer.length); } @Test public void testReleaseMoreThanMaxIdle() { SoftPool softLocalPool = new SoftPool<>(() -> new char[BufSize]); // Borrow more than MaxIdle objects List borrowedBuffers = new ArrayList<>(); for (int i = 0; i < SoftPool.MaxIdle + 5; i++) { char[] buffer = softLocalPool.borrow(); borrowedBuffers.add(buffer); } // Release all borrowed objects back to the pool for (char[] buffer : borrowedBuffers) { softLocalPool.release(buffer); } // Ensure the pool size does 
not exceed MaxIdle ArrayDeque stack = softLocalPool.getStack(); assertTrue(stack.size() <= SoftPool.MaxIdle, "Pool size exceeded MaxIdle limit"); } } ================================================ FILE: src/test/java/org/jsoup/internal/StringUtilTest.java ================================================ package org.jsoup.internal; import org.jsoup.Jsoup; import org.junit.jupiter.api.Test; import java.util.Arrays; import java.util.Collections; import static org.jsoup.internal.StringUtil.normaliseWhitespace; import static org.jsoup.internal.StringUtil.resolve; import static org.junit.jupiter.api.Assertions.*; public class StringUtilTest { @Test public void join() { assertEquals("", StringUtil.join(Collections.singletonList(""), " ")); assertEquals("one", StringUtil.join(Collections.singletonList("one"), " ")); assertEquals("one two three", StringUtil.join(Arrays.asList("one", "two", "three"), " ")); } @Test public void padding() { assertEquals("", StringUtil.padding(0)); assertEquals(" ", StringUtil.padding(1)); assertEquals(" ", StringUtil.padding(2)); assertEquals(" ", StringUtil.padding(15)); assertEquals(" ", StringUtil.padding(45)); // we default to tap out at 30 // memoization is up to 21 blocks (0 to 20 spaces) and exits early before min checks making maxPaddingWidth unused assertEquals("", StringUtil.padding(0, -1)); assertEquals(" ", StringUtil.padding(20, -1)); // this test escapes memoization and continues through assertEquals(" ", StringUtil.padding(21, -1)); // this test escapes memoization and using unlimited length (-1) will allow requested spaces assertEquals(" ", StringUtil.padding(30, -1)); assertEquals(" ", StringUtil.padding(45, -1)); // we tap out at 0 for this test assertEquals("", StringUtil.padding(0, 0)); // as memoization is escaped, setting zero for max padding will not allow any requested width assertEquals("", StringUtil.padding(21, 0)); // we tap out at 30 for these tests making > 30 use 30 assertEquals("", StringUtil.padding(0, 30)); 
assertEquals(" ", StringUtil.padding(1, 30)); assertEquals(" ", StringUtil.padding(2, 30)); assertEquals(" ", StringUtil.padding(15, 30)); assertEquals(" ", StringUtil.padding(45, 30)); // max applies regardless of memoized assertEquals(5, StringUtil.padding(20, 5).length()); } @Test public void paddingInACan() { String[] padding = StringUtil.padding; assertEquals(21, padding.length); for (int i = 0; i < padding.length; i++) { assertEquals(i, padding[i].length()); } } @Test public void isBlank() { assertTrue(StringUtil.isBlank(null)); assertTrue(StringUtil.isBlank("")); assertTrue(StringUtil.isBlank(" ")); assertTrue(StringUtil.isBlank(" \r\n ")); assertFalse(StringUtil.isBlank("hello")); assertFalse(StringUtil.isBlank(" hello ")); } @Test public void isNumeric() { assertFalse(StringUtil.isNumeric(null)); assertFalse(StringUtil.isNumeric(" ")); assertFalse(StringUtil.isNumeric("123 546")); assertFalse(StringUtil.isNumeric("hello")); assertFalse(StringUtil.isNumeric("123.334")); assertTrue(StringUtil.isNumeric("1")); assertTrue(StringUtil.isNumeric("1234")); } @Test public void isWhitespace() { assertTrue(StringUtil.isWhitespace('\t')); assertTrue(StringUtil.isWhitespace('\n')); assertTrue(StringUtil.isWhitespace('\r')); assertTrue(StringUtil.isWhitespace('\f')); assertTrue(StringUtil.isWhitespace(' ')); assertFalse(StringUtil.isWhitespace('\u00a0')); assertFalse(StringUtil.isWhitespace('\u2000')); assertFalse(StringUtil.isWhitespace('\u3000')); } @Test public void normaliseWhiteSpace() { assertEquals(" ", normaliseWhitespace(" \r \n \r\n")); assertEquals(" hello there ", normaliseWhitespace(" hello \r \n there \n")); assertEquals("hello", normaliseWhitespace("hello")); assertEquals("hello there", normaliseWhitespace("hello\nthere")); } @Test public void normaliseWhiteSpaceHandlesHighSurrogates() { String test71540chars = "\ud869\udeb2\u304b\u309a 1"; String test71540charsExpectedSingleWhitespace = "\ud869\udeb2\u304b\u309a 1"; 
assertEquals(test71540charsExpectedSingleWhitespace, normaliseWhitespace(test71540chars)); String extractedText = Jsoup.parse(test71540chars).text(); assertEquals(test71540charsExpectedSingleWhitespace, extractedText); } @Test public void resolvesRelativeUrls() { assertEquals("http://example.com/one/two?three", resolve("http://example.com", "./one/two?three")); assertEquals("http://example.com/one/two?three", resolve("http://example.com?one", "./one/two?three")); assertEquals("http://example.com/one/two?three#four", resolve("http://example.com", "./one/two?three#four")); assertEquals("https://example.com/one", resolve("http://example.com/", "https://example.com/one")); assertEquals("http://example.com/one/two.html", resolve("http://example.com/two/", "../one/two.html")); assertEquals("https://example2.com/one", resolve("https://example.com/", "//example2.com/one")); assertEquals("https://example.com:8080/one", resolve("https://example.com:8080", "./one")); assertEquals("https://example2.com/one", resolve("http://example.com/", "https://example2.com/one")); assertEquals("https://example.com/one", resolve("wrong", "https://example.com/one")); assertEquals("https://example.com/one", resolve("https://example.com/one", "")); assertEquals("", resolve("wrong", "also wrong")); assertEquals("ftp://example.com/one", resolve("ftp://example.com/two/", "../one")); assertEquals("ftp://example.com/one/two.c", resolve("ftp://example.com/one/", "./two.c")); assertEquals("ftp://example.com/one/two.c", resolve("ftp://example.com/one/", "two.c")); // examples taken from rfc3986 section 5.4.2 assertEquals("http://example.com/g", resolve("http://example.com/b/c/d;p?q", "../../../g")); assertEquals("http://example.com/g", resolve("http://example.com/b/c/d;p?q", "../../../../g")); assertEquals("http://example.com/g", resolve("http://example.com/b/c/d;p?q", "/./g")); assertEquals("http://example.com/g", resolve("http://example.com/b/c/d;p?q", "/../g")); 
assertEquals("http://example.com/b/c/g.", resolve("http://example.com/b/c/d;p?q", "g.")); assertEquals("http://example.com/b/c/.g", resolve("http://example.com/b/c/d;p?q", ".g")); assertEquals("http://example.com/b/c/g..", resolve("http://example.com/b/c/d;p?q", "g..")); assertEquals("http://example.com/b/c/..g", resolve("http://example.com/b/c/d;p?q", "..g")); assertEquals("http://example.com/b/g", resolve("http://example.com/b/c/d;p?q", "./../g")); assertEquals("http://example.com/b/c/g/", resolve("http://example.com/b/c/d;p?q", "./g/.")); assertEquals("http://example.com/b/c/g/h", resolve("http://example.com/b/c/d;p?q", "g/./h")); assertEquals("http://example.com/b/c/h", resolve("http://example.com/b/c/d;p?q", "g/../h")); assertEquals("http://example.com/b/c/g;x=1/y", resolve("http://example.com/b/c/d;p?q", "g;x=1/./y")); assertEquals("http://example.com/b/c/y", resolve("http://example.com/b/c/d;p?q", "g;x=1/../y")); assertEquals("http://example.com/b/c/g?y/./x", resolve("http://example.com/b/c/d;p?q", "g?y/./x")); assertEquals("http://example.com/b/c/g?y/../x", resolve("http://example.com/b/c/d;p?q", "g?y/../x")); assertEquals("http://example.com/b/c/g#s/./x", resolve("http://example.com/b/c/d;p?q", "g#s/./x")); assertEquals("http://example.com/b/c/g#s/../x", resolve("http://example.com/b/c/d;p?q", "g#s/../x")); } @Test void stripsControlCharsFromUrls() { // should resovle to an absolute url: assertEquals("foo:bar", resolve("\nhttps://\texample.com/", "\r\nfo\to:ba\br")); } @Test void allowsSpaceInUrl() { assertEquals("https://example.com/foo bar/", resolve("HTTPS://example.com/example/", "../foo bar/")); } @Test void isAscii() { assertTrue(StringUtil.isAscii("")); assertTrue(StringUtil.isAscii("example.com")); assertTrue(StringUtil.isAscii("One Two")); assertFalse(StringUtil.isAscii("🧔")); assertFalse(StringUtil.isAscii("测试")); assertFalse(StringUtil.isAscii("测试.com")); } @Test void isAsciiLetter() { assertTrue(StringUtil.isAsciiLetter('a')); 
assertTrue(StringUtil.isAsciiLetter('n')); assertTrue(StringUtil.isAsciiLetter('z')); assertTrue(StringUtil.isAsciiLetter('A')); assertTrue(StringUtil.isAsciiLetter('N')); assertTrue(StringUtil.isAsciiLetter('Z')); assertFalse(StringUtil.isAsciiLetter(' ')); assertFalse(StringUtil.isAsciiLetter('-')); assertFalse(StringUtil.isAsciiLetter('0')); assertFalse(StringUtil.isAsciiLetter('ß')); assertFalse(StringUtil.isAsciiLetter('Ě')); } @Test void isDigit() { assertTrue(StringUtil.isDigit('0')); assertTrue(StringUtil.isDigit('1')); assertTrue(StringUtil.isDigit('2')); assertTrue(StringUtil.isDigit('3')); assertTrue(StringUtil.isDigit('4')); assertTrue(StringUtil.isDigit('5')); assertTrue(StringUtil.isDigit('6')); assertTrue(StringUtil.isDigit('7')); assertTrue(StringUtil.isDigit('8')); assertTrue(StringUtil.isDigit('9')); assertFalse(StringUtil.isDigit('a')); assertFalse(StringUtil.isDigit('A')); assertFalse(StringUtil.isDigit('ä')); assertFalse(StringUtil.isDigit('Ä')); assertFalse(StringUtil.isDigit('١')); assertFalse(StringUtil.isDigit('୳')); } @Test void isHexDigit() { assertTrue(StringUtil.isHexDigit('0')); assertTrue(StringUtil.isHexDigit('1')); assertTrue(StringUtil.isHexDigit('2')); assertTrue(StringUtil.isHexDigit('3')); assertTrue(StringUtil.isHexDigit('4')); assertTrue(StringUtil.isHexDigit('5')); assertTrue(StringUtil.isHexDigit('6')); assertTrue(StringUtil.isHexDigit('7')); assertTrue(StringUtil.isHexDigit('8')); assertTrue(StringUtil.isHexDigit('9')); assertTrue(StringUtil.isHexDigit('a')); assertTrue(StringUtil.isHexDigit('b')); assertTrue(StringUtil.isHexDigit('c')); assertTrue(StringUtil.isHexDigit('d')); assertTrue(StringUtil.isHexDigit('e')); assertTrue(StringUtil.isHexDigit('f')); assertTrue(StringUtil.isHexDigit('A')); assertTrue(StringUtil.isHexDigit('B')); assertTrue(StringUtil.isHexDigit('C')); assertTrue(StringUtil.isHexDigit('D')); assertTrue(StringUtil.isHexDigit('E')); assertTrue(StringUtil.isHexDigit('F')); 
assertFalse(StringUtil.isHexDigit('g')); assertFalse(StringUtil.isHexDigit('G')); assertFalse(StringUtil.isHexDigit('ä')); assertFalse(StringUtil.isHexDigit('Ä')); assertFalse(StringUtil.isHexDigit('١')); assertFalse(StringUtil.isHexDigit('୳')); } } ================================================ FILE: src/test/java/org/jsoup/nodes/AttributeTest.java ================================================ package org.jsoup.nodes; import org.jsoup.Jsoup; import org.jsoup.parser.ParseSettings; import org.jsoup.parser.Parser; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.assertEquals; public class AttributeTest { @Test public void html() { Attribute attr = new Attribute("key", "value &"); assertEquals("key=\"value &\"", attr.html()); assertEquals(attr.html(), attr.toString()); } @Test public void htmlWithLtAndGtInValue() { Attribute attr = new Attribute("key", ""); assertEquals("key=\"<value>\"", attr.html()); } @Test public void testWithSupplementaryCharacterInAttributeKeyAndValue() { String s = new String(Character.toChars(135361)); Attribute attr = new Attribute(s, "A" + s + "B"); assertEquals(s + "=\"A" + s + "B\"", attr.html()); assertEquals(attr.html(), attr.toString()); } @Test public void validatesKeysNotEmpty() { assertThrows(IllegalArgumentException.class, () -> new Attribute(" ", "Check")); } @Test public void validatesKeysNotEmptyViaSet() { assertThrows(IllegalArgumentException.class, () -> { Attribute attr = new Attribute("One", "Check"); attr.setKey(" "); }); } @Test public void booleanAttributesAreEmptyStringValues() { Document doc = Jsoup.parse("