feat(functions): add extractTextFromHTML (#1)

Initially it was planned to use the JSDOM NPM package for the `extractTextFromHTML`
function, however the placement of JSDOM in the dependency system prompted
significant contemplation:
- placing JSDOM into production dependencies makes this package 3MB bigger,
which is plainly horrible considering JSDOM is used in just one function;
- placing JSDOM into optionalDependencies makes it possible not to install
it (however only if you specifically tell the CLI not to, essentially making it
almost no different from production dependencies in terms of size, as no one
would ever spend time carefully researching an NPM package's documentation just
to find out whether they need to use a very specific option or not) and adds
an additional 0.4s - 0.5s slowdown to the function due to the dynamic import;
- lastly, placing JSDOM into peerDependencies simply has no use in this
case, as the logic of peerDependencies is much more complex than is
necessary here.

Instead I chose to add another argument to the function, named `domParser`,
where you are meant to provide the DOMParser of your choice. That way, if
@resultium/utils is used in the browser, there is no additional load time or
slowdown, because the browser's default DOMParser is used, and on the server you
are able to supply your own parser, whether it be JSDOM, cheerio, puppeteer or
whatever you please.

However, JSDOM has been added as a devDependency in order to make tests possible,
as they run on the server side.

Resolves #1
This commit is contained in:
2024-05-19 16:37:58 +03:00
parent cafaad76aa
commit 2305e3505a
5 changed files with 2591 additions and 1819 deletions

View File

@@ -32,18 +32,20 @@
"license": "GPL-3.0-or-later",
"devDependencies": {
"@jest/globals": "^29.7.0",
"@types/node": "^20.11.25",
"@typescript-eslint/eslint-plugin": "^7.2.0",
"@typescript-eslint/parser": "^7.2.0",
"@types/jsdom": "^21.1.6",
"@types/node": "^20.12.12",
"@typescript-eslint/eslint-plugin": "^7.9.0",
"@typescript-eslint/parser": "^7.9.0",
"eslint": "^8.57.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-prefer-arrow": "^1.2.3",
"eslint-plugin-security": "^2.1.1",
"jest": "^29.7.0",
"jsdom": "^24.0.0",
"prettier": "^3.2.5",
"ts-jest": "^29.1.2",
"typedoc": "^0.25.12",
"typedoc": "^0.25.13",
"typedoc-plugin-markdown": "^3.17.1",
"typescript": "^5.4.2"
"typescript": "^5.4.5"
}
}

4323
pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,63 @@
/**
* Copyright 2024 Resultium LLC
*
* This file is part of @resultium/utils.
*
* @resultium/utils is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* @resultium/utils is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with @resultium/utils. If not, see <https://www.gnu.org/licenses/>.
*/
/**
 * Extracts the text content from an HTML string.
 *
 * @remarks
 * When `domParser` is omitted, this function falls back to
 * `new window.DOMParser()`. On the server you need to pass a DOMParser
 * explicitly (for example one obtained from JSDOM), because the `window`
 * object is not defined in NodeJS and evaluating the default would throw.
 *
 * The HTML is parsed with the `"text/html"` MIME type and the text content of
 * the resulting document's `body` is returned.
 *
 * @see {@link https://stackoverflow.com/q/32723111/14544732 | Why doesn't
 * Node.js have a native DOM?}
 * for more information on why this function is made the way it is
 *
 * @example
 * Here's how you would use this function in the browser
 * ```ts
 * const html = '<p><a>Lorem ipsum</a> dolor sit</p>'
 *
 * // prints "Lorem ipsum dolor sit"
 * console.log(extractTextFromHTML(html))
 * ```
 *
 * @example
 * Here's how you would use this function on the server
 * ```ts
 * import { JSDOM } from "jsdom";
 * const domParser = new new JSDOM().window.DOMParser();
 *
 * const html = '<p><a>Lorem ipsum</a> dolor sit</p>'
 *
 * // prints "Lorem ipsum dolor sit"
 * console.log(extractTextFromHTML(html, domParser))
 * ```
 *
 * @param html - HTML to extract text from
 * @param domParser - DOMParser to use; defaults to `new window.DOMParser()`
 * @returns The text content of the parsed document's body, or an empty string
 * if the parser reports no text content
 */
export const extractTextFromHTML = (
  html: string,
  domParser: DOMParser = new window.DOMParser(),
): string => {
  const parsed = domParser.parseFromString(html, "text/html");
  // `textContent` is typed as `string | null`; normalise it here so callers
  // always receive a string and never need their own null check.
  return parsed.body.textContent ?? "";
};

View File

@@ -20,6 +20,7 @@
export * from "./functions/capitalizeFirstChar";
export * from "./functions/classNames";
export * from "./functions/conditionalJoin";
export * from "./functions/extractTextFromHTML";
export * from "./functions/isDeepWeakEqual";
export * from "./functions/randomInRange";
export * from "./functions/removeFromArrayByKeyValue";

View File

@@ -0,0 +1,11 @@
import { expect, test } from "@jest/globals";
import { extractTextFromHTML } from "../src";
import { JSDOM } from "jsdom";
// JSDOM supplies the DOMParser implementation handed to the function under test.
const domParser = new new JSDOM().window.DOMParser();

// The callback awaits nothing, so it is a plain synchronous test.
test("extracts text content from an HTML string", () => {
  const html = "<p><a>Lorem ipsum</a> dolor sit</p>";
  expect(extractTextFromHTML(html, domParser)).toBe("Lorem ipsum dolor sit");
});