feat(functions): add extractTextFromHTML (#1)
Initially it was planned to use the JSDOM NPM package for the `extractTextFromHTML` function; however, deciding where to place JSDOM in the dependency system prompted significant contemplation: - placing JSDOM into production dependencies makes this package 3MB bigger, which is unacceptable considering JSDOM is used in just one function; - placing JSDOM into optionalDependencies makes it possible not to install it (but only if you specifically tell the CLI not to, which makes it almost no different from production dependencies in terms of size, as no one would ever spend time carefully researching an NPM package's documentation just to find out whether a very specific option is needed) and adds an additional 0.4s - 0.5s slowdown to the function due to the dynamic import; - lastly, placing JSDOM into peerDependencies simply has no use here, as the logic of peerDependencies is much more complex than this case requires. Instead I chose to add another argument to the function, named `domParser`, where you are meant to provide a DOMParser of your choice. That way, if @resultium/utils is used in the browser, there is no additional load time or slowdown because the browser's default DOMParser is used, and on the server you are able to define your own parser, whether it be JSDOM, cheerio, puppeteer or whatever you please. However, JSDOM has been added as a devDependency in order to make tests possible, as they run on the server side. Resolves #1
This commit is contained in:
12
package.json
12
package.json
@@ -32,18 +32,20 @@
|
||||
"license": "GPL-3.0-or-later",
|
||||
"devDependencies": {
|
||||
"@jest/globals": "^29.7.0",
|
||||
"@types/node": "^20.11.25",
|
||||
"@typescript-eslint/eslint-plugin": "^7.2.0",
|
||||
"@typescript-eslint/parser": "^7.2.0",
|
||||
"@types/jsdom": "^21.1.6",
|
||||
"@types/node": "^20.12.12",
|
||||
"@typescript-eslint/eslint-plugin": "^7.9.0",
|
||||
"@typescript-eslint/parser": "^7.9.0",
|
||||
"eslint": "^8.57.0",
|
||||
"eslint-config-prettier": "^9.1.0",
|
||||
"eslint-plugin-prefer-arrow": "^1.2.3",
|
||||
"eslint-plugin-security": "^2.1.1",
|
||||
"jest": "^29.7.0",
|
||||
"jsdom": "^24.0.0",
|
||||
"prettier": "^3.2.5",
|
||||
"ts-jest": "^29.1.2",
|
||||
"typedoc": "^0.25.12",
|
||||
"typedoc": "^0.25.13",
|
||||
"typedoc-plugin-markdown": "^3.17.1",
|
||||
"typescript": "^5.4.2"
|
||||
"typescript": "^5.4.5"
|
||||
}
|
||||
}
|
||||
|
||||
4323
pnpm-lock.yaml
generated
4323
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
63
src/functions/extractTextFromHTML.ts
Normal file
63
src/functions/extractTextFromHTML.ts
Normal file
@@ -0,0 +1,63 @@
|
||||
/**
|
||||
* Copyright 2024 Resultium LLC
|
||||
*
|
||||
* This file is part of @resultium/utils.
|
||||
*
|
||||
* @resultium/utils is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* any later version.
|
||||
*
|
||||
* @resultium/utils is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with @resultium/utils. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
 * Builds the parser used when the caller does not supply one.
 *
 * `typeof` guards are used because the identifiers may not be declared at all
 * (for example in NodeJS), where touching them directly throws a
 * `ReferenceError`.
 *
 * @returns A new instance of `window.DOMParser` or of the global `DOMParser`
 * @throws Error when neither `window.DOMParser` nor a global `DOMParser` exists
 */
const createDefaultDOMParser = (): DOMParser => {
  if (typeof window !== "undefined" && typeof window.DOMParser === "function") {
    return new window.DOMParser();
  }

  if (typeof DOMParser === "function") {
    return new DOMParser();
  }

  throw new Error(
    "extractTextFromHTML: no DOMParser is available in this environment, pass one as the second argument",
  );
};

/**
 * Extracts text content from an HTML string
 *
 * @remarks
 * By default this function uses `new window.DOMParser()` to parse HTML, however
 * on the server you need to pass your own parser (for example the `DOMParser`
 * exposed by a JSDOM window) as `window` object is not defined in NodeJS.
 * The only thing required from the parser is a
 * `parseFromString(html, "text/html")` method returning an object whose
 * `body.textContent` holds the text.
 *
 * @see {@link https://stackoverflow.com/q/32723111/14544732 | Why doesn't
 * Node.js have a native DOM?}
 * for more information on why this function is made the way it is
 *
 * @example
 * Here's how you would use this function in browser
 * ```ts
 * const html = '<p><a>Lorem ipsum</a> dolor sit</p>'
 *
 * // prints "Lorem ipsum dolor sit"
 * console.log(extractTextFromHTML(html))
 * ```
 *
 * @example
 * Here's how you would use this function on server
 * ```ts
 * import { JSDOM } from "jsdom";
 * const dom = new JSDOM();
 * const domParser = new dom.window.DOMParser();
 *
 * const html = '<p><a>Lorem ipsum</a> dolor sit</p>'
 *
 * // prints "Lorem ipsum dolor sit"
 * console.log(extractTextFromHTML(html, domParser))
 * ```
 *
 * @param html - HTML to extract text from
 * @param domParser - DOMParser to use
 * @returns Text content of the parsed document body, or an empty string when
 * the parser reports no text content
 * @throws Error when `domParser` is omitted and no default DOMParser exists
 */
export const extractTextFromHTML = (
  html: string,
  domParser: DOMParser = createDefaultDOMParser(),
): string => {
  // Named `parsed` so the global `document` is not shadowed
  const parsed = domParser.parseFromString(html, "text/html");

  // `textContent` is typed as nullable, normalise so callers always get a string
  return parsed.body.textContent ?? "";
};
|
||||
@@ -20,6 +20,7 @@
|
||||
export * from "./functions/capitalizeFirstChar";
|
||||
export * from "./functions/classNames";
|
||||
export * from "./functions/conditionalJoin";
|
||||
export * from "./functions/extractTextFromHTML";
|
||||
export * from "./functions/isDeepWeakEqual";
|
||||
export * from "./functions/randomInRange";
|
||||
export * from "./functions/removeFromArrayByKeyValue";
|
||||
|
||||
11
tests/extractTextFromHTML.test.ts
Normal file
11
tests/extractTextFromHTML.test.ts
Normal file
@@ -0,0 +1,11 @@
|
||||
import { expect, test } from "@jest/globals";
|
||||
import { extractTextFromHTML } from "../src";
|
||||
import { JSDOM } from "jsdom";
|
||||
|
||||
// The parser comes from a JSDOM window because the tests run on the server
// side, where `window` is not defined.
const dom = new JSDOM();
const domParser = new dom.window.DOMParser();

test("extracts text content from an HTML string", () => {
  const html = "<p><a>Lorem ipsum</a> dolor sit</p>";

  expect(extractTextFromHTML(html, domParser)).toBe("Lorem ipsum dolor sit");
});

test("returns an empty string for an empty HTML string", () => {
  expect(extractTextFromHTML("", domParser)).toBe("");
});
|
||||
Reference in New Issue
Block a user