如果找到某个 URL，则使用 FS 写入新文件；如果不再找到该 URL，则删除对应的文件

我正在尝试编写一个脚本：每发现一个新的 URL，就把该 URL 转换为哈希，然后检查对应的文件是否已经写入。如果已经写入，就忽略它；如果之前没有见过，则应该添加它。

// Fetch the page at mainUrl and save every link matched by 'div div a' into its
// own file, named after the MD5 hash of the link's href.
// NOTE: this version only ever writes files. It does not skip files that were
// already written and it does not remove files whose URL is no longer on the page.
needle.get(mainUrl, function(err, res) {
  if (err) throw err;
  if (res.statusCode == 200) {
    const $ = cheerio.load(res.body);
    $('div div a').each(function(index, element) {
      const url = $(element).attr("href");
      urlList.push(url);
      const hash = crypto.createHash('md5').update(url).digest('hex');

      // The trailing '/' matters: without it the file is created next to the
      // directory as 'otherdirectory<hash>' instead of inside it.
      fs.writeFile('./directory/otherdirectory/' + hash, url, (err) => {
        if (err) throw err;
        console.log('Hash created: ' + url + ' saved as ' + hash);
      });
    });
  }
});

这是我迄今为止所做的，但这只是写入新文件。它不会检查是否已经添加了文件，也不会删除找不到的文件。

所以我想做的是：

我写了一个脚本，用来从网站上获取 URL
对所有URL进行哈希处理
让 FS 检查文件是否已经写入，如果已经写入则忽略它
如果以前不知道，请将其作为新文件添加
如果在提取时找不到url，请将其从列表中删除

我认为这可能是一个X/Y问题，因此我仍在等待我的评论的答案。

话虽如此，您可以简单地使用 fs.existsSync 来忽略已有文件：如果返回 true，则跳过保存当前文件，否则保存它。要删除不再可用的文件，只需使用 fs.readdir 获取目录中的所有文件，并使用 fs.unlink 删除 URL 不在响应中的文件：

// Keep './directory/otherdirectory/' in sync with the links currently on the page:
//   - every url found gets a file named after its MD5 hash (existing files are left alone)
//   - every file whose name is not the hash of a url found on this fetch is deleted
needle.get(mainUrl, (err, res) => {
  if (err) throw err;
  if (res.statusCode == 200) {
    const $ = cheerio.load(res.body);
    const dir = './directory/otherdirectory/';
    const hashes = new Set();                        // hashes of the urls seen on this fetch (used below to keep only the files that are still available)

    $('div div a').each((index, element) => {
      const url = $(element).attr("href");
      if (!url) return;                              // anchor without an href: nothing to hash (hash.update(undefined) would throw)

      const hash = crypto.createHash('md5').update(url).digest('hex');
      if (hashes.has(hash)) return;                  // same url listed twice on the page: already handled
      hashes.add(hash);                              // remember the hash of the current url

      if (!fs.existsSync(dir + hash)) {              // only write the file if it does not exist yet
        fs.writeFile(dir + hash, url, err => {
          if (err) throw err;
          console.log('Hash created: ' + url + ' saved as ' + hash);
        });
      }
    });

    // Cleanup runs only after a successful fetch, so a failed request never wipes the directory.
    fs.readdir(dir, (err, files) => {                // list every file currently in the directory
      if (err) throw err;
      files.forEach(file => {
        if (!hashes.has(file)) {                     // not encountered above, so its url is gone from the page
          fs.unlink(dir + file, err => {             // remove it
            if (err) throw err;
          });
        }
      });
    });
  }
});

另一种方法：

由于您似乎只想存储url，因此最好的方法是使用一个文件来存储所有url，而不是将每个url存储在自己的文件中。像这样的东西更有效：

// Fetch the page at mainUrl and overwrite './directory/list-of-urls' with the
// hrefs of all links matched by 'div div a', one url per line.
needle.get(mainUrl, (err, res) => {
  if (err) throw err;
  if (res.statusCode == 200) {
    const $ = cheerio.load(res.body);
    const urls = $('div div a')                                // get the 'a' elements
      .map((index, element) => $(element).attr("href"))        // map each one into its href attribute
      .get();                                                  // and get them as a plain array

    // Join with a real newline ('\n', not the letter 'n') so each url sits on its own line.
    fs.writeFile('./directory/list-of-urls', urls.join('\n'), err => {
      if (err) throw err;
      console.log('saved all the urls to the file "list-of-urls"');
    });
  }
});

这样，当文件每次被覆盖时，旧的URL将被自动删除，新的URL将自动添加。无需检查是否已经遇到url，因为它无论如何都会被重新保存。

如果你想在其他地方获得 URL 列表，只需读取文件并按 '\n' 进行拆分，如下所示：

// Read './directory/list-of-urls' back into an array of urls (one url per line).
fs.readFile('./directory/list-of-urls', 'utf8', (err, data) => {
  if (err) throw err;
  // Split on a real newline ('\n', not the letter 'n'); drop empty lines so an
  // empty file gives [] instead of [''].
  const urls = data.split('\n').filter(line => line !== '');
  // use urls here
});

相关内容

最新更新

热门标签：