使用 Node.js 提取标签之间的 HTML 元素



假设我有一个网站,其中包含 html 源代码,结构如下:

<html>
<head>
....
<table id="xxx">
<tr>
..
</table>

我已经用一个库闭合了所有 html 标签。您能否告诉我,可以用哪些库或正则表达式,从 html 源代码中提取以 <table> 开头、以 </table> 结尾的这一段里的所有文本?

(使用 Node.js。)

下面是我的代码

// NOTE(review): `port` is not defined anywhere in this snippet; presumably it
// comes from surrounding server code that is not shown.
console.log('todo list RESTful API server started on: ' + port);

var request = require('request');
// NOTE(review): cheerio is required here but never used in this snippet.
var cheerio = require('cheerio');
// Fetch the page and hand the result to the callback as (error, response, body).
request('https://fpp.mpfa.org.hk/tc_chi/mpp_list.jsp', function (error, response, body) {
console.log('error:', error); // Print the error if one occurred
console.log('statusCode:', response && response.statusCode); // Print the response status code if a response was received
var sanitizeHtml = require('sanitize-html');
// NOTE(review): `[(.*)]` is a character class, so this regex matches a single
// '(', '.', '*' or ')' character instead of capturing a span of text.
// match() returns null when nothing matches, which makes .pop() throw; the
// same happens if `body` is not a string (e.g. presumably after a failed request).
var dirty = body.match(/[(.*)]/).pop();
// Sanitize with an empty tag whitelist and an empty attribute whitelist.
var clean = sanitizeHtml(dirty, {
allowedTags: [  ],
allowedAttributes: {
},
allowedIframeHostnames: ['www.youtube.com']
});
console.log('body:', clean); // Print the sanitized result
});

你只需要使用 cheerio 的 API 来获取<table>,然后打印出文本节点。

给定页面的以下 HTML:

<!DOCTYPE html>
<html lang="en">
<head>
<title>Contacts</title>
</head>
<body>
<main>
<h1>Hello</h1>
<section>
<h2>World</h2>
<!-- The only table on the page: two rows of three cells each -->
<table>
<tr>
<td>foo</td>
<td>bar</td>
<td>fizz</td>
</tr>
<tr>
<td>buzz</td>
<td>hello</td>
<td>world</td>
</tr>
</table>
</section>
</main>
</body>
</html>

并运行以下代码:

const request = require("request");
const cheerio = require("cheerio");
const URL_TO_PARSE = "http://localhost/my-page.html";
// Make a request to get the HTML of the page.
// The callback receives (err, response, body).
request(URL_TO_PARSE, (err, response, body) => {
  // Rethrow the original error so its message and stack trace are preserved
  // instead of being replaced by a generic one.
  if (err) throw err;
  // Do not parse an error page (404, 500, ...) as if it were the expected document.
  if (response.statusCode !== 200) {
    throw new Error(`Unexpected status code: ${response.statusCode}`);
  }
  // Load the HTML into cheerio's DOM
  const $ = cheerio.load(body);
  // Print the text nodes of the <table> in the HTML
  console.log($("table").text());
});

将产生以下输出:

foo
bar
fizz

buzz
hello
world

然后,您可以根据需要对其进行操作。 Cheerio使用与jQuery非常相似的API。

最新更新