Repository: ospfranco/link-preview-js Branch: main Commit: 98da401d4066 Files: 22 Total size: 645.8 KB Directory structure: gitextract_p10xncbh/ ├── .github/ │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.md │ │ └── feature_request.md │ └── workflows/ │ ├── publish.yml │ └── tests.yml ├── .gitignore ├── .npmignore ├── .prettierignore ├── .prettierrc.json ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── __tests__/ │ ├── __snapshots__/ │ │ └── index.spec.ts.snap │ ├── index.spec.ts │ └── sampleResponse.json ├── bump-version.sh ├── constants.ts ├── index.ts ├── jest.config.js ├── mise.toml ├── package.json └── tsconfig.json ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/FUNDING.yml ================================================ # These are supported funding model platforms github: [ospfranco] ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: Create a report to help us improve title: '' labels: '' assignees: '' --- **Describe the bug** A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior: 1. Go to '...' 2. Click on '....' 3. Scroll down to '....' 4. See error **Expected behavior** A clear and concise description of what you expected to happen. **Screenshots** If applicable, add screenshots to help explain your problem. **Desktop (please complete the following information):** - OS: [e.g. iOS] - Browser [e.g. chrome, safari] - Version [e.g. 22] **Smartphone (please complete the following information):** - Device: [e.g. iPhone6] - OS: [e.g. iOS8.1] - Browser [e.g. stock browser, safari] - Version [e.g. 22] **Additional context** Add any other context about the problem here. 
================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: Feature request about: Suggest an idea for this project title: '' labels: '' assignees: '' --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] **Describe the solution you'd like** A clear and concise description of what you want to happen. **Describe alternatives you've considered** A clear and concise description of any alternative solutions or features you've considered. **Additional context** Add any other context or screenshots about the feature request here. ================================================ FILE: .github/workflows/publish.yml ================================================ name: "publish" on: push: tags: - "*" jobs: publish: permissions: contents: write runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/setup-node@v4 with: node-version: "22" - name: Cache dependencies id: yarn-cache uses: actions/cache@v3 with: path: | **/node_modules .yarn/install-state.gz key: ${{ runner.os }}-yarn-${{ hashFiles('yarn.lock') }}-${{ hashFiles('**/package.json', '!node_modules/**') }} restore-keys: | ${{ runner.os }}-yarn-${{ hashFiles('yarn.lock') }} ${{ runner.os }}-yarn- - run: yarn install - name: Update package.json version run: | TAG=${GITHUB_REF#refs/tags/} jq --arg version "$TAG" '.version = $version' package.json > tmp.$$.json && mv tmp.$$.json package.json - name: Compile typescript run: yarn build - uses: JS-DevTools/npm-publish@v3 with: token: ${{ secrets.NPM_TOKEN }} - name: Create a Release uses: elgohr/Github-Release-Action@v5 env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: title: Release ${{ github.ref_name }} tag: ${{ github.ref_name }} ================================================ FILE: .github/workflows/tests.yml 
================================================ name: Tests on: pull_request jobs: tests: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 - name: Cache dependencies id: yarn-cache uses: actions/cache@v3 with: path: | **/node_modules .yarn/install-state.gz key: ${{ runner.os }}-yarn-${{ hashFiles('yarn.lock') }}-${{ hashFiles('**/package.json', '!node_modules/**') }} restore-keys: | ${{ runner.os }}-yarn-${{ hashFiles('yarn.lock') }} ${{ runner.os }}-yarn- - name: Install dependencies run: yarn - name: Run tests run: yarn test ================================================ FILE: .gitignore ================================================ node_modules/ .vscode/ build/ .DS_Store ================================================ FILE: .npmignore ================================================ .github __tests__ .vscode .eslintrc.js index.ts jest.config.js LICENSE README.md tsconfig.json typings.d.ts ================================================ FILE: .prettierignore ================================================ build ================================================ FILE: .prettierrc.json ================================================ {} ================================================ FILE: CONTRIBUTING.md ================================================ If you add new functionality please make sure you add the appropriate tests and that the tests are passing. Please also try to keep the style consistent; using the linter is a must. 
================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2019 Oscar Franco Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================

Link Preview JS

npm i link-preview-js

**Before creating an issue** It's more than likely there is nothing wrong with the library: - It's very simple; fetch HTML, parse HTML, and search for OpenGraph HTML tags. - Unless HTML or the OpenGraph standard change, the library will not break - If the target website you are trying to preview redirects you to a login page **the preview will fail**, because it will parse the login page - If the target website does not have OpenGraph tags **the preview will most likely fail**, there are some fallbacks but in general, it will not work - **You cannot preview (fetch) another web page from YOUR web page. This is an intentional security feature of browsers called CORS**

DO NOT FETCH CONTENT DIRECTLY FROM A USER'S DEVICE. ONLY RUN THIS ON YOUR SERVER AND SANDBOX IT IF YOU CAN

Browsers block this via CORS, but you might be clever like a fox and run this in React Native. This is a bad idea: you are exposing the device user to potentially malicious links. If you use this library and find it useful please consider [sponsoring me](https://github.com/sponsors/ospfranco), open source takes a lot of time and effort. # Link Preview Allows you to extract information from an HTTP URL/link (or parse an HTML string) and retrieve meta information such as title, description, images, videos, etc. via **OpenGraph** tags. ## Discord Join the Discord https://discord.gg/W9XmqCQCKP ## GOTCHAs - You cannot request a different domain from your web app (Browsers block cross-origin-requests). If you don't know how _same-origin-policy_ works, [here is a good intro](https://dev.to/lydiahallie/cs-visualized-cors-5b8h). Therefore, **this library works on Node.js and certain mobile run-times (Cordova or React-Native)**. - **This library acts as if the user would visit the page, sites might re-direct you to sign-up pages, consent screens, etc.** You can try to change the user-agent header (try with `google-bot` or with `Twitterbot`), but you need to work around these issues yourself. ## API `getLinkPreview`: you have to pass a string, doesn't matter if it is just a URL or a piece of text that contains a URL, the library will take care of parsing it and returning the info of the first valid HTTP(S) URL it finds. `getPreviewFromContent`: useful for passing a pre-fetched Response object from an existing async/etc. call. Refer to the example below for required object values. 
```typescript import { getLinkPreview, getPreviewFromContent } from "link-preview-js"; // pass the link directly getLinkPreview("https://www.youtube.com/watch?v=MejbOFk7H6c").then((data) => console.debug(data) ); ////////////////////////// OR ////////////////////////// // pass a chunk of text getLinkPreview( "This is a text supposed to be parsed and the first link displayed https://www.youtube.com/watch?v=MejbOFk7H6c" ).then((data) => console.debug(data)); ////////////////////////// OR ////////////////////////// // pass a pre-fetched response object // The passed response object should include, at minimum: // { // data: '...', // response content // headers: { // ... // // should include content-type // content-type: "text/html; charset=ISO-8859-1", // ... // }, // url: 'https://domain.com/' // resolved url // } yourAjaxCall(url, (response) => { getPreviewFromContent(response).then((data) => console.debug(data)); }); ``` ## Options Additionally, you can pass an options object which should add more functionality to the parsing of the link | Property Name | Result | | -------------------------------------------------------------------------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | imagesPropertyType (**optional**) (ex: 'og') | Fetches images only with the specified property, `meta[property='${imagesPropertyType}:image']` | | headers (**optional**) (ex: { 'user-agent': 'googlebot', 'Accept-Language': 'en-US' }) | Add request headers to fetch call | | timeout (**optional**) (ex: 1000) | Timeout for the request to fail | | followRedirects (**optional**) (default 'error') | For security reasons, the library does not automatically follow redirects ('error' value), a malicious agent can exploit redirects to steal data, posible values: ('error', 'follow', 'manual') | | handleRedirects (**optional**) (with 
followRedirects 'manual') | When followRedirects is set to 'manual' you need to pass a function that validates if the redirection is secure, below you can find an example | | resolveDNSHost (**optional**) | Function that resolves the final address of the detected/parsed URL to prevent SSRF attacks | | onResponse (**optional**) | Function that handles the response object to allow for managing special cases | ```javascript getLinkPreview("https://www.youtube.com/watch?v=MejbOFk7H6c", { imagesPropertyType: "og", // fetches only open-graph images headers: { "user-agent": "googlebot", // fetches with googlebot crawler user agent "Accept-Language": "fr-CA", // fetches site for French language // ...other optional HTTP request headers }, timeout: 1000, }).then((data) => console.debug(data)); ``` ## SSRF Concerns Doing requests on behalf of your users or using user-provided URLs is dangerous. One such attack is trying to fetch a domain that redirects to localhost so the users get the contents of your server (doesn't affect mobile runtimes). To mitigate this attack you can use the resolveDNSHost option: ```ts // example how to use node's dns resolver const dns = require("node:dns"); getLinkPreview("http://maliciousLocalHostRedirection.com", { resolveDNSHost: async (url: string) => { return new Promise((resolve, reject) => { const hostname = new URL(url).hostname; dns.lookup(hostname, (err, address, family) => { if (err) { reject(err); return; } resolve(address); // if address resolves to localhost or '127.0.0.1' library will throw an error }); }); }, }).catch((e) => { // will throw a detected redirection to localhost }); ``` This might add some latency to your request but prevents loopback attacks. ## Redirections As with SSRF, following redirections is dangerous; the library errors by default when the response tries to redirect the user. There are however some simple redirections that are valid (e.g. 
HTTP to HTTPS) and you might want to allow them, you can do it via: ```ts await getLinkPreview(`http://google.com/`, { followRedirects: `manual`, handleRedirects: (baseURL: string, forwardedURL: string) => { const urlObj = new URL(baseURL); const forwardedURLObj = new URL(forwardedURL); if ( forwardedURLObj.hostname === urlObj.hostname || forwardedURLObj.hostname === "www." + urlObj.hostname || "www." + forwardedURLObj.hostname === urlObj.hostname ) { return true; } else { return false; } }, }); ``` ## onResponse In some cases the website might be missing OpenGraph tags and you might want to provide your own custom logic to try to parse data. For example, if the library is unable to detect a description, you might want to use the text value of the first paragraph instead. This callback gives you access to the Cheerio (the library internally used to parse the HTML) instance, as well as the URL object so you could handle cases on a site-by-site basis, if you need to. This callback must return the modified response object ```javascript await getLinkPreview(`https://example.com/`, { onResponse: (response, doc, URL) => { if (URL.hostname == 'example.com') { response.siteName = 'Example Website'; } if (!response.description) { response.description = doc('p').first().text(); } return response; }, }); ``` ## Response Returns a Promise that resolves with an object describing the provided link. The info object returned varies depending on the content type (MIME type) returned in the HTTP response (see below for variations of response). Rejects with an error if the response can not be parsed or if there was no URL in the text provided. 
### Text/HTML URL ```javascript { url: "https://www.youtube.com/watch?v=MejbOFk7H6c", title: "OK Go - Needing/Getting - Official Video - YouTube", siteName: "YouTube", description: "Buy the video on iTunes: https://itunes.apple.com/us/album/needing-getting-bundle-ep/id508124847 See more about the guitars at: http://www.gretschguitars.com...", images: ["https://i.ytimg.com/vi/MejbOFk7H6c/maxresdefault.jpg"], mediaType: "video.other", contentType: "text/html", charset: "utf-8" videos: [], favicons:["https://www.youtube.com/yts/img/favicon_32-vflOogEID.png","https://www.youtube.com/yts/img/favicon_48-vflVjB_Qk.png","https://www.youtube.com/yts/img/favicon_96-vflW9Ec0w.png","https://www.youtube.com/yts/img/favicon_144-vfliLAfaB.png","https://s.ytimg.com/yts/img/favicon-vfl8qSV2F.ico"] } ``` ### Image URL ```javascript { url: "https://media.npr.org/assets/img/2018/04/27/gettyimages-656523922nunes-4bb9a194ab2986834622983bb2f8fe57728a9e5f-s1100-c15.jpg", mediaType: "image", contentType: "image/jpeg", favicons: [ "https://media.npr.org/favicon.ico" ] } ``` ### Audio URL ```javascript { url: "https://ondemand.npr.org/anon.npr-mp3/npr/atc/2007/12/20071231_atc_13.mp3", mediaType: "audio", contentType: "audio/mpeg", favicons: [ "https://ondemand.npr.org/favicon.ico" ] } ``` ### Video URL ```javascript { url: "https://www.w3schools.com/html/mov_bbb.mp4", mediaType: "video", contentType: "video/mp4", favicons: [ "https://www.w3schools.com/favicon.ico" ] } ``` ### Application URL ```javascript { url: "https://assets.curtmfg.com/masterlibrary/56282/installsheet/CME_56282_INS.pdf", mediaType: "application", contentType: "application/pdf", favicons: [ "https://assets.curtmfg.com/favicon.ico" ] } ``` ## License MIT license ================================================ FILE: __tests__/__snapshots__/index.spec.ts.snap ================================================ // Jest Snapshot v1, https://goo.gl/fbAQLP exports[`#getLinkPreview() no link in text should fail gracefully 1`] = 
`"link-preview-js did not receive a valid a url or text"`; exports[`#getLinkPreview() should handle empty strings gracefully 1`] = `"link-preview-js did not receive a valid url or text"`; exports[`#getLinkPreview() should handle malformed urls gracefully 1`] = `"link-preview-js did not receive a valid a url or text"`; ================================================ FILE: __tests__/index.spec.ts ================================================ import { getLinkPreview, getPreviewFromContent } from "../index"; import prefetchedResponse from "./sampleResponse.json"; describe(`#getLinkPreview()`, () => { it(`should extract link info from just URL`, async () => { const linkInfo: any = await getLinkPreview( `https://www.youtube.com/watch?v=wuClZjOdT30`, { headers: { "Accept-Language": `en-US` } } ); expect(linkInfo.url).toEqual(`https://www.youtube.com/watch?v=wuClZjOdT30`); expect(linkInfo.siteName).toEqual(`YouTube`); expect(linkInfo.title).toEqual(`Geography Now! Germany`); expect(linkInfo.description).toBeTruthy(); expect(linkInfo.mediaType).toEqual(`video.other`); expect(linkInfo.images.length).toEqual(1); expect(linkInfo.images[0]).toEqual( `https://i.ytimg.com/vi/wuClZjOdT30/maxresdefault.jpg` ); expect(linkInfo.videos.length).toEqual(0); expect(linkInfo.favicons[0]).not.toBe(``); expect(linkInfo.contentType.toLowerCase()).toEqual(`text/html`); expect(linkInfo.charset?.toLowerCase()).toEqual(`utf-8`); }); it("returns charset of website", async () => { const linkInfo: any = await getLinkPreview(`https://www.pravda.com.ua`); expect(linkInfo.url).toEqual(`https://www.pravda.com.ua/`); expect(linkInfo.contentType.toLowerCase()).toEqual(`text/html`); expect(linkInfo.charset?.toLowerCase()).toEqual(`utf-8`); }); xit("should extract author from news article", async () => { const linkInfo: any = await getLinkPreview( `https://www.usatoday.com/story/special/contributor-content/2025/10/15/why-chaos-engineering-is-more-important-than-ever-in-the-ai-era/86712877007/` ); 
expect(linkInfo.author).toEqual(`Matt Emma`); }); it(`should extract link info from a URL with a newline`, async () => { const linkInfo: any = await getLinkPreview( ` https://www.youtube.com/watch?v=wuClZjOdT30 `, { headers: { "Accept-Language": `en-US` } } ); expect(linkInfo.url).toEqual(`https://www.youtube.com/watch?v=wuClZjOdT30`); expect(linkInfo.title).toEqual(`Geography Now! Germany`); expect(linkInfo.siteName).toBeTruthy(); expect(linkInfo.description).toBeTruthy(); expect(linkInfo.mediaType).toEqual(`video.other`); expect(linkInfo.images.length).toEqual(1); expect(linkInfo.images[0]).toEqual( `https://i.ytimg.com/vi/wuClZjOdT30/maxresdefault.jpg` ); expect(linkInfo.videos.length).toEqual(0); expect(linkInfo.favicons[0]).not.toBe(``); expect(linkInfo.contentType.toLowerCase()).toEqual(`text/html`); }); it(`should extract link info from just text with a URL`, async () => { const linkInfo: any = await getLinkPreview( `This is some text blah blah https://www.youtube.com/watch?v=wuClZjOdT30 and more text`, { headers: { "Accept-Language": `en-US` } } ); expect(linkInfo.url).toEqual(`https://www.youtube.com/watch?v=wuClZjOdT30`); expect(linkInfo.title).toEqual(`Geography Now! 
Germany`); expect(linkInfo.siteName).toEqual(`YouTube`); expect(linkInfo.description).toBeTruthy(); expect(linkInfo.mediaType).toEqual(`video.other`); expect(linkInfo.images.length).toEqual(1); expect(linkInfo.images[0]).toEqual( `https://i.ytimg.com/vi/wuClZjOdT30/maxresdefault.jpg` ); expect(linkInfo.videos.length).toEqual(0); expect(linkInfo.favicons[0]).toBeTruthy(); expect(linkInfo.contentType.toLowerCase()).toEqual(`text/html`); }); // it(`should make request with different languages`, async () => { // let linkInfo: any = await getLinkPreview(`https://www.wikipedia.org/`, { // headers: { "Accept-Language": `es` }, // followRedirects: `follow`, // }); // expect(linkInfo.title).toContain(`Wikipedia, la enciclopedia libre`); // linkInfo = await getLinkPreview(`https://www.wikipedia.org/`); // expect(linkInfo.title).toContain(`Wikipedia`); // }); it(`should handle audio urls`, async () => { const linkInfo = await getLinkPreview( `https://ondemand.npr.org/anon.npr-mp3/npr/atc/2007/12/20071231_atc_13.mp3` ); expect(linkInfo.url).toEqual( `https://ondemand.npr.org/anon.npr-mp3/npr/atc/2007/12/20071231_atc_13.mp3` ); expect(linkInfo.mediaType).toEqual(`audio`); expect(linkInfo.contentType?.toLowerCase()).toEqual(`audio/mpeg`); expect(linkInfo.favicons[0]).toBeTruthy(); }); it(`should handle video urls`, async () => { const linkInfo = await getLinkPreview( `https://www.w3schools.com/html/mov_bbb.mp4` ); expect(linkInfo.url).toEqual(`https://www.w3schools.com/html/mov_bbb.mp4`); expect(linkInfo.mediaType).toEqual(`video`); expect(linkInfo.contentType?.toLowerCase()).toEqual(`video/mp4`); expect(linkInfo.favicons[0]).toBeTruthy(); }); it(`should handle image urls`, async () => { const linkInfo = await getLinkPreview( `https://media.npr.org/assets/img/2018/04/27/gettyimages-656523922nunes-4bb9a194ab2986834622983bb2f8fe57728a9e5f-s1100-c15.jpg` ); expect(linkInfo.url).toEqual( 
`https://media.npr.org/assets/img/2018/04/27/gettyimages-656523922nunes-4bb9a194ab2986834622983bb2f8fe57728a9e5f-s1100-c15.jpg` ); expect(linkInfo.mediaType).toEqual(`image`); expect(linkInfo.contentType?.toLowerCase()).toEqual(`image/jpeg`); expect(linkInfo.favicons[0]).toBeTruthy(); }); it(`should handle unknown content type urls`, async () => { const linkInfo = await getLinkPreview(`https://mjml.io/try-it-live`); expect(linkInfo.url).toEqual(`https://mjml.io/try-it-live`); expect(linkInfo.mediaType).toEqual(`website`); }); // This site changed? it is not returning application any more but rather website // it.skip(`should handle application urls`, async () => { // const linkInfo = await getLinkPreview( // `https://assets.curtmfg.com/masterlibrary/56282/installsheet/CME_56282_INS.pdf` // ); // expect(linkInfo.url).toEqual( // `https://assets.curtmfg.com/masterlibrary/56282/installsheet/CME_56282_INS.pdf` // ); // expect(linkInfo.mediaType).toEqual(`application`); // expect(linkInfo.contentType?.toLowerCase()).toEqual(`application/pdf`); // expect(linkInfo.favicons[0]).toBeTruthy(); // }); it(`no link in text should fail gracefully`, async () => { await expect( getLinkPreview(`no link`) ).rejects.toThrowErrorMatchingSnapshot(); }); it(`should handle malformed urls gracefully`, async () => { await expect( getLinkPreview( `this is a malformed link: ahttps://www.youtube.com/watch?v=wuClZjOdT30` ) ).rejects.toThrowErrorMatchingSnapshot(); }); it(`should handle empty strings gracefully`, async () => { await expect(getLinkPreview(``)).rejects.toThrowErrorMatchingSnapshot(); }); it.skip(`should handle a proxy url option`, async () => { // origin header is required by cors-anywhere const linkInfo: any = await getLinkPreview( `https://www.youtube.com/watch?v=wuClZjOdT30`, { proxyUrl: `https://cors-anywhere.herokuapp.com/`, headers: { Origin: `http://localhost:8000`, "Accept-Language": `en-US`, }, } ); 
expect(linkInfo.url).toEqual(`https://www.youtube.com/watch?v=wuClZjOdT30`); expect(linkInfo.siteName).toEqual(`YouTube`); expect(linkInfo.title).toEqual(`Geography Now! Germany`); expect(linkInfo.description).toBeTruthy(); expect(linkInfo.mediaType).toEqual(`video.other`); expect(linkInfo.images.length).toEqual(1); expect(linkInfo.images[0]).toEqual( `https://i.ytimg.com/vi/wuClZjOdT30/maxresdefault.jpg` ); expect(linkInfo.videos.length).toEqual(0); expect(linkInfo.favicons[0]).not.toBe(``); expect(linkInfo.contentType.toLowerCase()).toEqual(`text/html`); }); it("should timeout (default 3s) with infinite loading link", async () => { try { await getLinkPreview( `https://www.gamestop.com/video-games/pc-gaming/components/cooling/products/hyper-212-rgb-black-edition-fan/185243.html?gclid=Cj0KCQjwraqHBhDsARIsAKuGZeECDlqkF2cxpcuS0xRxQmrv5BxFawWS_B51kiqehPf64_KlO0oyunsaAhn5EALw_wcB&gclsrc=aw.ds` ); } catch (e: any) { expect(e.message).toEqual("Request timeout"); } }); it("should timeout (custom 1s) with infinite loading link", async () => { try { await getLinkPreview( `https://www.gamestop.com/video-games/pc-gaming/components/cooling/products/hyper-212-rgb-black-edition-fan/185243.html?gclid=Cj0KCQjwraqHBhDsARIsAKuGZeECDlqkF2cxpcuS0xRxQmrv5BxFawWS_B51kiqehPf64_KlO0oyunsaAhn5EALw_wcB&gclsrc=aw.ds`, { timeout: 1000, } ); } catch (e: any) { expect(e.message).toEqual("Request timeout"); } }); it(`should handle followRedirects option is error`, async () => { try { await getLinkPreview(`http://google.com/`, { followRedirects: `error` }); } catch (e: any) { expect(e.message).toEqual(`fetch failed`); } }); it(`should handle followRedirects option is manual but handleRedirects was not provided`, async () => { try { await getLinkPreview(`http://google.com/`, { followRedirects: `manual` }); } catch (e: any) { expect(e.message).toEqual( `link-preview-js followRedirects is set to manual, but no handleRedirects function was provided` ); } }); it(`should handle followRedirects option 
is manual with handleRedirects function`, async () => { const response = await getLinkPreview(`http://google.com/`, { followRedirects: `manual`, handleRedirects: (baseURL: string, forwardedURL: string) => { if (forwardedURL !== `http://www.google.com/`) { return false; } return true; }, }); expect(response.contentType).toEqual(`text/html`); expect(response.url).toEqual(`http://www.google.com/`); expect(response.mediaType).toEqual(`website`); }); it("should handle override response body using onResponse option", async () => { let firstParagraphText; const res: any = await getLinkPreview(`https://www.example.com/`, { onResponse: (result, doc) => { firstParagraphText = doc("p") .first() .text() .split("\n") .map((x) => x.trim()) .join(" "); result.siteName = `SiteName has been overridden`; result.description = firstParagraphText; return result; }, }); expect(res.siteName).toEqual("SiteName has been overridden"); expect(res.description).toEqual(firstParagraphText); }); it("should handle video tags without type or secure_url tags", async () => { const res: any = await getLinkPreview( `https://newpathtitle.com/falling-markets-how-to-stop-buyer-from-getting-out/`, { followRedirects: `follow` } ); expect(res.siteName).toEqual(`New Path Title`); expect(res.title).toEqual( `Falling Markets: How To Stop A Buyer From Getting Out | New Path Title` ); expect(res.description).toBeTruthy(); expect(res.mediaType).toEqual(`article`); expect(res.images.length).toBeGreaterThan(0); expect(res.videos.length).toBeGreaterThan(0); expect(res.videos[0].url).toEqual( `https://www.youtube.com/embed/nqNXjxpAPkU` ); expect(res.favicons.length).toBeGreaterThan(0); expect(res.contentType.toLowerCase()).toEqual(`text/html`); }); }); describe(`#getPreviewFromContent`, () => { it(`Basic parsing`, async () => { const linkInfo: any = await getPreviewFromContent(prefetchedResponse); expect(linkInfo.url).toEqual(`https://www.youtube.com/watch?v=wuClZjOdT30`); 
expect(linkInfo.siteName).toEqual(`YouTube`); expect(linkInfo.title).toEqual(`Geography Now! Germany`); expect(linkInfo.description).toBeTruthy(); expect(linkInfo.mediaType).toEqual(`video.other`); expect(linkInfo.images.length).toEqual(1); expect(linkInfo.images[0]).toEqual( `https://i.ytimg.com/vi/wuClZjOdT30/maxresdefault.jpg` ); expect(linkInfo.videos.length).toEqual(0); expect(linkInfo.favicons[0]).not.toBe(``); expect(linkInfo.contentType.toLowerCase()).toEqual(`text/html`); }); }); ================================================ FILE: __tests__/sampleResponse.json ================================================ {"url":"https://www.youtube.com/watch?v=wuClZjOdT30","headers":{"alt-svc":"h3-29=\":443\"; ma=2592000,h3-T051=\":443\"; ma=2592000,h3-Q050=\":443\"; ma=2592000,h3-Q046=\":443\"; ma=2592000,h3-Q043=\":443\"; ma=2592000,quic=\":443\"; ma=2592000; v=\"46,43\"","cache-control":"no-cache, no-store, max-age=0, must-revalidate","connection":"close","content-encoding":"gzip","content-type":"text/html; charset=utf-8","date":"Fri, 11 Dec 2020 19:36:56 GMT","expires":"Mon, 01 Jan 1990 00:00:00 GMT","p3p":"CP=\"This is not a P3P policy! See http://support.google.com/accounts/answer/151657?hl=en for more info.\"","pragma":"no-cache","server":"ESF","set-cookie":"YSC=jCz2JZmjRfE; Domain=.youtube.com; Path=/; Secure; HttpOnly; SameSite=none, VISITOR_INFO1_LIVE=oa3D7kCWj44; Domain=.youtube.com; Expires=Wed, 09-Jun-2021 19:36:56 GMT; Path=/; Secure; HttpOnly; SameSite=none","strict-transport-security":"max-age=31536000","transfer-encoding":"chunked","x-content-type-options":"nosniff","x-frame-options":"SAMEORIGIN","x-xss-protection":"0"},"data":"Geography Now! Germany - YouTube
AboutPressCopyrightContact usCreatorsAdvertiseDevelopersImpressumNetzDG TransparenzberichtNetzDG ComplaintsTermsPrivacyPolicy & SafetyHow YouTube worksTest new features
© 2020 Google LLC
"} ================================================ FILE: bump-version.sh ================================================ #!/bin/bash set -ex npm --no-git-tag-version version patch git add . git commit -m "Bump version" # git tag $PACKAGE_VERSION git push ================================================ FILE: constants.ts ================================================ export const CONSTANTS = { REGEX_VALID_URL: new RegExp( "^" + // protocol identifier "(?:(?:https?|ftp)://)" + // user:pass authentication "(?:\\S+(?::\\S*)?@)?" + "(?:" + // IP address exclusion // private & local networks "(?!(?:10|127)(?:\\.\\d{1,3}){3})" + "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" + "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" + // IP address dotted notation octets // excludes loopback network 0.0.0.0 // excludes reserved space >= 224.0.0.0 // excludes network & broacast addresses // (first & last IP address of each class) "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" + "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" + "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" + "|" + // host name "(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)" + // domain name "(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*" + // TLD identifier "(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))" + // TLD may end with dot "\\.?" + ")" + // port number "(?::\\d{2,5})?" + // resource path "(?:[/?#]\\S*)?" 
+ "$", "i" ), REGEX_LOOPBACK: new RegExp( "^" + // Loopback: 127.0.0.0 - 127.255.255.255 "(?:127(?:\\.\\d{1,3}){3})" + "|" + // Private Class A: 10.0.0.0 - 10.255.255.255 "(?:10(?:\\.\\d{1,3}){3})" + "|" + // Private Class B: 172.16.0.0 - 172.31.255.255 "(?:172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" + "|" + // Private Class C: 192.168.0.0 - 192.168.255.255 "(?:192\\.168(?:\\.\\d{1,3}){2})" + "|" + // Link-local: 169.254.0.0 - 169.254.255.255 "(?:169\\.254(?:\\.\\d{1,3}){2})" + "|" + // Documentation: 192.0.2.0/24, 198.51.100.0/24, 203.0.113.0/24 "(?:192\\.0\\.2(?:\\.\\d{1,3}){1})" + "|" + "(?:198\\.51\\.100(?:\\.\\d{1,3}){1})" + "|" + "(?:203\\.0\\.113(?:\\.\\d{1,3}){1})" + "|" + // Carrier-Grade NAT (CGNAT): 100.64.0.0 - 100.127.255.255 "(?:100\\.(?:6[4-9]|[7-9]\\d|1[0-1]\\d)(?:\\.\\d{1,3}){2})" + "$", "i" ), REGEX_CONTENT_TYPE_IMAGE: new RegExp("image/.*", "i"), REGEX_CONTENT_TYPE_AUDIO: new RegExp("audio/.*", "i"), REGEX_CONTENT_TYPE_VIDEO: new RegExp("video/.*", "i"), REGEX_CONTENT_TYPE_TEXT: new RegExp("text/.*", "i"), REGEX_CONTENT_TYPE_APPLICATION: new RegExp("application/.*", "i"), }; ================================================ FILE: index.ts ================================================ import cheerio from "cheerio"; import { CONSTANTS } from "./constants"; interface ILinkPreviewResponse { url: string; title: string; siteName: string | undefined; author: string | undefined; description: string | undefined; mediaType: string; contentType: string | undefined; images: string[]; videos: IVideoType[]; favicons: string[]; } interface IVideoType { url: string | undefined; secureUrl: string | null | undefined; type: string | null | undefined; width: string | undefined; height: string | undefined; } interface ILinkPreviewOptions { headers?: Record; imagesPropertyType?: string; proxyUrl?: string; timeout?: number; followRedirects?: `follow` | `error` | `manual`; resolveDNSHost?: (url: string) => Promise; handleRedirects?: (baseURL: string, 
forwardedURL: string) => boolean; onResponse?: ( response: ILinkPreviewResponse, doc: cheerio.Root, url?: URL, ) => ILinkPreviewResponse; } interface IPreFetchedResource { headers: Record; status?: number; imagesPropertyType?: string; proxyUrl?: string; url: string; data: string; } function throwOnLoopback(address: string) { if (CONSTANTS.REGEX_LOOPBACK.test(address)) { throw new Error("SSRF request detected, trying to query host"); } } function metaTag(doc: cheerio.Root, type: string, attr: string) { const nodes = doc(`meta[${attr}='${type}']`); return nodes.length ? nodes : null; } function metaTagContent(doc: cheerio.Root, type: string, attr: string) { return doc(`meta[${attr}='${type}']`).attr(`content`); } function getTitle(doc: cheerio.Root) { let title = metaTagContent(doc, `og:title`, `property`) || metaTagContent(doc, `og:title`, `name`); if (!title) { title = doc(`head > title`).text(); } return title; } function getSiteName(doc: cheerio.Root) { const siteName = metaTagContent(doc, `og:site_name`, `property`) || metaTagContent(doc, `og:site_name`, `name`); return siteName; } function getAuthor(doc: cheerio.Root) { const author = metaTagContent(doc, `author`, `name`) || metaTagContent(doc, `article:author`, `property`); return author; } function getDescription(doc: cheerio.Root) { const description = metaTagContent(doc, `description`, `name`) || metaTagContent(doc, `Description`, `name`) || metaTagContent(doc, `og:description`, `property`); return description; } function getMediaType(doc: cheerio.Root) { const node = metaTag(doc, `medium`, `name`); if (node) { const content = node.attr(`content`); return content === `image` ? 
`photo` : content; } return ( metaTagContent(doc, `og:type`, `property`) || metaTagContent(doc, `og:type`, `name`) ); } function getImages( doc: cheerio.Root, rootUrl: string, imagesPropertyType?: string, ) { let images: string[] = []; let nodes: cheerio.Cheerio | null; let src: string | undefined; let dic: Record = {}; const imagePropertyType = imagesPropertyType ?? `og`; nodes = metaTag(doc, `${imagePropertyType}:image`, `property`) || metaTag(doc, `${imagePropertyType}:image`, `name`); if (nodes) { nodes.each((_: number, node: cheerio.Element) => { if (node.type === `tag`) { src = node.attribs.content; if (src) { src = new URL(src, rootUrl).href; images.push(src); } } }); } if (images.length <= 0 && !imagesPropertyType) { src = doc(`link[rel=image_src]`).attr(`href`); if (src) { src = new URL(src, rootUrl).href; images = [src]; } else { nodes = doc(`img`); if (nodes?.length) { dic = {}; images = []; nodes.each((_: number, node: cheerio.Element) => { if (node.type === `tag`) src = node.attribs.src; if (src && !dic[src]) { dic[src] = true; // width = node.attribs.width; // height = node.attribs.height; images.push(new URL(src, rootUrl).href); } }); } } } return images; } function getVideos(doc: cheerio.Root) { const videos = []; let nodeTypes; let nodeSecureUrls; let nodeType; let nodeSecureUrl; let video; let videoType; let videoSecureUrl; let width; let height; let videoObj; let index; const nodes = metaTag(doc, `og:video`, `property`) || metaTag(doc, `og:video`, `name`); if (nodes?.length) { nodeTypes = metaTag(doc, `og:video:type`, `property`) || metaTag(doc, `og:video:type`, `name`); nodeSecureUrls = metaTag(doc, `og:video:secure_url`, `property`) || metaTag(doc, `og:video:secure_url`, `name`); width = metaTagContent(doc, `og:video:width`, `property`) || metaTagContent(doc, `og:video:width`, `name`); height = metaTagContent(doc, `og:video:height`, `property`) || metaTagContent(doc, `og:video:height`, `name`); for (index = 0; index < nodes.length; index += 1) 
{ const node = nodes[index]; if (node.type === `tag`) video = node.attribs.content; nodeType = nodeTypes?.[index]; if (nodeType?.type === `tag`) { videoType = nodeType ? nodeType.attribs.content : null; } nodeSecureUrl = nodeSecureUrls?.[index]; if (nodeSecureUrl?.type === `tag`) { videoSecureUrl = nodeSecureUrl ? nodeSecureUrl.attribs.content : null; } videoObj = { url: video, secureUrl: videoSecureUrl, type: videoType, width, height, }; if (videoType && videoType.indexOf(`video/`) === 0) { videos.splice(0, 0, videoObj); } else { videos.push(videoObj); } } } return videos; } // returns default favicon (//hostname/favicon.ico) for a url function getDefaultFavicon(rootUrl: string): string { return new URL(`/favicon.ico`, rootUrl).href; } // returns an array of URLs to favicon images function getFavicons(doc: cheerio.Root, rootUrl: string): string[] { const images = []; let nodes: cheerio.Cheerio | never[] = []; let src: string | undefined; const relSelectors = [ `rel=icon`, `rel="shortcut icon"`, `rel=apple-touch-icon`, ]; relSelectors.forEach((relSelector) => { // look for all icon tags nodes = doc(`link[${relSelector}]`); // collect all images from icon tags if (nodes.length) { nodes.each((_: number, node: cheerio.Element) => { if (node.type === `tag`) src = node.attribs.href; if (src) { src = new URL(src, rootUrl).href; images.push(src); } }); } }); // if no icon images, use default favicon location if (images.length <= 0) { images.push(getDefaultFavicon(rootUrl)); } return images; } function parseImageResponse(url: string, contentType: string) { return { url, mediaType: `image`, contentType, favicons: [getDefaultFavicon(url)], }; } function parseAudioResponse(url: string, contentType: string) { return { url, mediaType: `audio`, contentType, favicons: [getDefaultFavicon(url)], }; } function parseVideoResponse(url: string, contentType: string) { return { url, mediaType: `video`, contentType, favicons: [getDefaultFavicon(url)], }; } function 
parseApplicationResponse(url: string, contentType: string) { return { url, mediaType: `application`, contentType, favicons: [getDefaultFavicon(url)], }; } function parseTextResponse( body: string, url: string, options: ILinkPreviewOptions = {}, contentType?: string, ): ILinkPreviewResponse { const doc = cheerio.load(body); let response = { url, title: getTitle(doc), siteName: getSiteName(doc), description: getDescription(doc), author: getAuthor(doc), mediaType: getMediaType(doc) || `website`, contentType, images: getImages(doc, url, options.imagesPropertyType), videos: getVideos(doc), favicons: getFavicons(doc, url), }; if (options?.onResponse && typeof options.onResponse !== `function`) { throw new Error(`link-preview-js onResponse option must be a function`); } if (options?.onResponse) { // send in a cloned response (to avoid mutation of original response reference) const clonedResponse = structuredClone(response); const urlObject = new URL(url); response = options.onResponse(clonedResponse, doc, urlObject); } return response; } function parseUnknownResponse( body: string, url: string, options: ILinkPreviewOptions = {}, contentType?: string, ) { return parseTextResponse(body, url, options, contentType); } function parseResponse( response: IPreFetchedResource, options?: ILinkPreviewOptions, ) { try { // console.log("[link-preview-js] response", response); let contentType = response.headers[`content-type`]; let contentTypeTokens: string[] = []; let charset = null; if (!contentType) { return parseUnknownResponse(response.data, response.url, options); } if (contentType.includes(`;`)) { contentTypeTokens = contentType.split(`;`); contentType = contentTypeTokens[0]; for (let token of contentTypeTokens) { if (token.indexOf("charset=") !== -1) { charset = token.split("=")[1]; } } } // parse response depending on content type if (CONSTANTS.REGEX_CONTENT_TYPE_IMAGE.test(contentType)) { return { ...parseImageResponse(response.url, contentType), charset }; } if 
(CONSTANTS.REGEX_CONTENT_TYPE_AUDIO.test(contentType)) { return { ...parseAudioResponse(response.url, contentType), charset }; } if (CONSTANTS.REGEX_CONTENT_TYPE_VIDEO.test(contentType)) { return { ...parseVideoResponse(response.url, contentType), charset }; } if (CONSTANTS.REGEX_CONTENT_TYPE_TEXT.test(contentType)) { return { ...parseTextResponse(response.data, response.url, options, contentType), charset, }; } if (CONSTANTS.REGEX_CONTENT_TYPE_APPLICATION.test(contentType)) { return { ...parseApplicationResponse(response.url, contentType), charset, }; } const htmlString = response.data; return { ...parseUnknownResponse(htmlString, response.url, options), charset, }; } catch (e) { throw new Error( `link-preview-js could not fetch link information ${( e as any ).toString()}`, ); } } /** * Parses the text, extracts the first link it finds and does a HTTP request * to fetch the website content, afterwards it tries to parse the internal HTML * and extract the information via meta tags * @param text string, text to be parsed * @param options ILinkPreviewOptions */ export async function getLinkPreview( text: string, options?: ILinkPreviewOptions, ) { if (!text || typeof text !== `string`) { throw new Error(`link-preview-js did not receive a valid url or text`); } const detectedUrl = text .replace(/\n/g, ` `) .split(` `) .find((token) => CONSTANTS.REGEX_VALID_URL.test(token)); if (!detectedUrl) { throw new Error(`link-preview-js did not receive a valid a url or text`); } if (options?.followRedirects === `manual` && !options?.handleRedirects) { throw new Error( `link-preview-js followRedirects is set to manual, but no handleRedirects function was provided`, ); } if (!!options?.resolveDNSHost) { const resolvedUrl = await options.resolveDNSHost(detectedUrl); throwOnLoopback(resolvedUrl); } const timeout = options?.timeout ?? 
3000; // 3 second timeout default const controller = new AbortController(); const timeoutCounter = setTimeout(() => controller.abort(), timeout); const fetchOptions = { headers: options?.headers ?? {}, redirect: options?.followRedirects ?? `error`, signal: controller.signal, }; const fetchUrl = options?.proxyUrl ? options.proxyUrl.concat(detectedUrl) : detectedUrl; let response = await fetch(fetchUrl, fetchOptions).catch((e) => { if (e.name === `AbortError`) { throw new Error(`Request timeout`); } clearTimeout(timeoutCounter); throw e; }); if ( response.status > 300 && response.status < 309 && fetchOptions.redirect === `manual` && options?.handleRedirects ) { const locationHeader = response.headers.get(`location`) || ``; const isAbsoluteURI = locationHeader.startsWith("http://") || locationHeader.startsWith("https://"); // Resolve the URL, handling both absolute and relative URLs const forwardedUrl = isAbsoluteURI ? locationHeader : new URL(locationHeader, fetchUrl).href; if (!options.handleRedirects(fetchUrl, forwardedUrl)) { throw new Error(`link-preview-js could not handle redirect`); } if (!!options?.resolveDNSHost) { const resolvedUrl = await options.resolveDNSHost(forwardedUrl); throwOnLoopback(resolvedUrl); } response = await fetch(forwardedUrl, fetchOptions as any); } clearTimeout(timeoutCounter); const headers: Record = {}; response.headers.forEach((header, key) => { headers[key] = header; }); const normalizedResponse: IPreFetchedResource = { url: options?.proxyUrl ? 
response.url.replace(options.proxyUrl, ``) : response.url, headers, data: await response.text(), }; return parseResponse(normalizedResponse, options); } /** * Skip the library fetching the website for you, instead pass a response object * from whatever source you get and use the internal parsing of the HTML to return * the necessary information * @param response Preview Response * @param options IPreviewLinkOptions */ export async function getPreviewFromContent( response: IPreFetchedResource, options?: ILinkPreviewOptions, ) { if (!response || typeof response !== `object`) { throw new Error(`link-preview-js did not receive a valid response object`); } if (!response.url) { throw new Error(`link-preview-js did not receive a valid response object`); } return parseResponse(response, options); } ================================================ FILE: jest.config.js ================================================ module.exports = { preset: `ts-jest`, testEnvironment: `node`, globals: { "ts-jest": { diagnostics: false } }, testPathIgnorePatterns: ["/build/"], }; ================================================ FILE: mise.toml ================================================ [tools] node = "24" ================================================ FILE: package.json ================================================ { "name": "link-preview-js", "version": "3.0.6", "description": "Javascript module to extract and fetch HTTP link information from blocks of text.", "main": "build/index.js", "exports": { ".": { "require": "./build/index.js", "import": "./build/index.js" }, "./package.json": "./package.json" }, "types": "build/index.d.ts", "scripts": { "test": "jest", "test:ci": "jest --testLocationInResults --ci --outputFile=test_results.json --json", "build": "tsc", "bump": "./bump-version.sh", "prepublishOnly": "tsc" }, "keywords": [ "javascript", "link", "url", "http", "preview" ], "author": "Oscar Franco", "email": "ospfranco@protonmail.com", "license": "MIT", "repository": 
"https://github.com/ospfranco/link-preview-js", "dependencies": { "cheerio": "1.0.0-rc.11" }, "files": [ "build" ], "devDependencies": { "@skypack/package-check": "^0.2.2", "@types/cheerio": "0.22.24", "@types/jest": "^28.1.4", "jest": "^28.1.2", "prettier": "2.7.1", "ts-jest": "^28.0.5", "typescript": "^4.7.4" }, "packageManager": "yarn@1.22.22+sha1.ac34549e6aa8e7ead463a7407e1c7390f61a6610", "engines": { "node": ">=18" } } ================================================ FILE: tsconfig.json ================================================ { "compilerOptions": { "outDir": "build", "allowSyntheticDefaultImports": true, "esModuleInterop": true, "moduleResolution": "node", "declaration": true, "target": "es5", "skipLibCheck": true, "strict": true, "resolveJsonModule": true }, "exclude": ["node_modules", "build"] }