抓取Google新闻



我正在尝试抓取 Google 新闻,但我使用的代码有问题:

 var express = require('express');
var request = require('request');
var cheerio = require('cheerio');
var path = require('path');
var fs =require('fs');
var app = express();
var port = 8080;

 // Define the request url
 var url = "https://news.google.com/news";
// Fetch the page and walk every headline link inside the story tables.
request(url,function(err,res,body){
var news=[];
 var $page = cheerio.load(body);
// $url receives the return value of the .find(...).each(...) chain, not a list
// of strings; the hrefs themselves are pushed into `news` inside the callback.
var $url=$page('table[class="esc-layout-table"]').find('tbody > tr > td > div > h2 > a').each(function (index, element) {
  news.push($page(element).attr('href'));
});
// NOTE(review): this reassignment throws away the hrefs collected above and
// stores the selection held in $url instead; it is presumably the reason raw
// node objects show up in the console rather than plain URLs.
news={
//desc:$desc,
    url:$url,
//    img:$img,
};

// The comma between the two arguments was missing, which is a syntax error.
console.log ('success ....', news);

});

我想获取文章的链接、标题以及缩略图,然后把它们存储到 Firebase 数据库中。但不幸的是,我在后续的步骤中迷失了方向:我无法得到预期的结果,控制台中输出的却是:

    success .... { '0':
   { type: 'tag',
     name: 'a',
     attribs:
      { target: '_blank',
        class: 'article usg-AFQjCNFxodYTzKo8-hM57511iQgBdfn8xA did-3230940966728164415',
        href: 'https://www.washingtonpost.com/news/post-nation/wp/2016/12/15/jurors-begin-deliberating-in-charleston-church-shooting-trial/',
        url: 'https://www.washingtonpost.com/news/post-nation/wp/2016/12/15/jurors-begin-deliberating-in-charleston-church-shooting-trial/',
        id: 'MAA4AEgAUABgAWoCdXM',
        ssid: 'h' },
     children: [ [Object] ],
     next: null,
     prev: null,
     parent:
      { type: 'tag',
        name: 'h2',
        attribs: [Object],
        children: [Object],
        next: null,
        prev: null,
        parent: [Object] } },
  '1':
   { type: 'tag',
     name: 'a',
     attribs:
      { target: '_blank',
        class: 'article usg-AFQjCNEQY4otecPJJevDyoBp3K-IQnes2w did-141563424311867977',
        href: 'http://www.businessinsider.com/facebook-will-fact-check-label-fake-news-in-news-feed-2016-12',
        url: 'http://www.businessinsider.com/facebook-will-fact-check-label-fake-news-in-news-feed-2016-12',
        id: 'MAA4AEgBUABgAWoCdXM',
        ssid: 'h' },
     children: [ [Object] ],
     next: null,
     prev: null,
     parent:
      { type: 'tag',
        name: 'h2',
        attribs: [Object],
        children: [Object],
        next: null,
        prev: null,
        parent: [Object] } },
  '2':
   { type: 'tag',
     name: 'a',
     attribs:
      { target: '_blank',
        class: 'article usg-AFQjCNHyGG4zl4RW-AoIILTssJX_TKCybg did--2293954291931624250',
        href: 'http://www.bbc.com/news/world-middle-east-38329461',
        url: 'http://www.bbc.com/news/world-middle-east-38329461',
        id: 'MAA4AEgCUABgAWoCdXM',
        ssid: 'h' },
     children: [ [Object] ],
     next: null,
     prev: null,
     parent:
      { type: 'tag',
        name: 'h2',
        attribs: [Object],
        children: [Object],
        next: null,
        prev: null,
        parent: [Object] } },
  '3':

注意:以上只是输出结果的一部分;如果您运行这段代码,就能清楚地看到我的意思。感谢您的建议。

不确定这是否是您想要的,但请尝试一下:我按照您的需求获取了所有的 URL(以及标题和图片)。
 var express = require('express');
var request = require('request');
var cheerio = require('cheerio');
var path = require('path');
var fs =require('fs');
var app = express();
var port = 8080;

 // Define the request url
 var url = "https://news.google.com/news";
// Download the page and build one { url, title, image } record per story table.
request(url, function (err, res, body) {
  // Without this guard a failed request leaves `body` undefined and the
  // failure only surfaces later as an unrelated error inside cheerio.load().
  if (err) {
    console.error('request failed:', err);
    return;
  }
  if (res.statusCode !== 200) {
    console.error('unexpected HTTP status:', res.statusCode);
    return;
  }

  var news = [];
  var $ = cheerio.load(body);

  // Shared selector prefix for the lead-article title link of a story table.
  var titleLinkSelector = 'tbody > tr > .esc-layout-article-cell > .esc-lead-article-title-wrapper > .esc-lead-article-title > a';
  var thumbnailSelector = 'tbody > tr > .esc-layout-thumbnail-cell > .esc-thumbnail-wrapper > .esc-thumbnail-state > .esc-thumbnail > a > .esc-thumbnail-image-wrapper > img';

  $('table[class="esc-layout-table"]').each(function () {
    var $story = $(this);
    var image = $story.find(thumbnailSelector).attr('src');
    var title = $story.find(titleLinkSelector + ' > span').text();
    // Named articleUrl so it does not shadow the request url declared above.
    var articleUrl = $story.find(titleLinkSelector).attr('href');
    news.push({ url: articleUrl, title: title, image: image });
  });

  console.log('success ....', news);
});

结果

success .... [ { url: 'http://www.cbc.ca/news/canada/saskatchewan/prisoner-killed-sask-penitentiary-1.3898175',
    title: '1 dead, at least 8 injured in Saskatchewan prison riot',
    image: '//t2.gstatic.com/images?q=tbn:ANd9GcRKhvczSgL4g3dO8EHkruAEB5AoqkI-PvbB8LzlHBZTPGJYh4bEooNKApDXqTzboNrLpqv3H7MG' },
  { url: 'http://www.cbc.ca/news/world/aleppo-convoy-evacuation-1.3895602',
    title: '3000 people evacuated from eastern Aleppo so far as fragile ceasefire holds',
    image: '//t2.gstatic.com/images?q=tbn:ANd9GcQ86WfdQJVFE4GrQvu_CPrjx3sqhqut0gjBRv6opfVA4JLIqsWeBDjRWURFGd7h_XN_0D0DnoQ5' },
  { url: 'http://www.cp24.com/news/five-family-members-dead-following-first-nation-fire-1.3204960',
    title: 'Five family members dead following First Nation fire',
    image: '//t1.gstatic.com/images?q=tbn:ANd9GcS6DkmtVrLs4wzVLDfNZvfOm9Js6rXvSg8ttjdoofJwWUZkM2wSjvLA-HpVZdTJN7pG-1FubXI' },
  { url: 'http://www.ctvnews.ca/business/b-c-offers-five-year-interest-free-down-payment-loans-to-first-time-buyers-1.3205119',
    title: 'BC offers five-year, interest-free down-payment loans to first-time buyers',
    image: '//t3.gstatic.com/images?q=tbn:ANd9GcR2gUEW4E0gtt5Sj-jFIJP0iC1JIIZ3qi5RbpbwD7otN7B5nKf8qXT-Q1Aaxcs5Z7FVn-LhNXU5' },

希望可以帮助您:)

除了 Equimper 的答案外,您还可以从 Google 的自然(organic)新闻搜索结果中提取数据。在线 IDE 中的完整代码:

const cheerio = require("cheerio");
const axios = require("axios");
const searchString = "elon musk";                   // what we want to search
// URI-encoded form of the query. Kept for code that builds a URL by hand; it is
// deliberately NOT passed to axios below, because axios encodes `params` values
// itself and a pre-encoded value would be encoded twice ("%20" -> "%2520").
const encodedString = encodeURI(searchString);
// Request configuration shared by every call to the Google search endpoint.
const AXIOS_OPTIONS = {
    headers: {
        "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
    },                                                  // adding the User-Agent header as one way to prevent the request from being blocked
    params: {
        q: searchString,                                // raw search string; axios performs the URL encoding
        tbm: "nws",                                     // parameter defines the type of search you want to do ("nws" means news)
        hl: 'en',                                       // Parameter defines the language to use for the Google search
        gl: 'us'                                        // parameter defines the country to use for the Google search
    },
};
/**
 * Requests the Google search page described by AXIOS_OPTIONS and extracts one
 * record per news result card (elements matching `.WlydOe`).
 *
 * @returns {Promise<Array<{link: string, source: string, title: string, snippet: string, image: string, date: string}>>}
 *   Resolves with the extracted records; rejects when the HTTP request fails.
 */
function getNewsInfo() {
    return axios
        .get(`https://www.google.com/search`, AXIOS_OPTIONS)
        .then(function ({ data }) {
            const $ = cheerio.load(data);

            // Matches inline script fragments of the form  s='<image data>';<word> <word>=['<id>'];
            // and captures the image data and the id of the <img> it belongs to.
            // The regex escapes (\w, \s, \d, \[, \]) are required: without them
            // no `id` group exists and no thumbnail can ever be resolved.
            const pattern = /s='(?<img>[^']+)';\w+\s\w+=\['(?<id>\w+_\d+)'\];/gm; //https://regex101.com/r/pMd0yx/1

            // Index thumbnails by element id once, keeping the first occurrence
            // of each id, so every card needs only a single lookup.
            const imagesById = new Map();
            for (const { groups } of data.matchAll(pattern)) {
                if (!imagesById.has(groups.id)) {
                    // The data sits inside a JS string literal, where "=" is
                    // written as the escape sequence \x3d; decode it.
                    imagesById.set(groups.id, groups.img.replace(/\\x3d/g, '='));
                }
            }

            // Remove line breaks (not the letter "n") from extracted text.
            const stripNewlines = (text) => text.replace(/\n/g, '');

            return Array.from($('.WlydOe')).map((el) => {
                const $card = $(el);
                const thumbnailId = $card.find('.uhHOwf img').attr('id');
                return {
                    link: $card.attr('href'),
                    source: $card.find('.CEMjEf span').text().trim(),
                    title: stripNewlines($card.find('.mCBkyc').text().trim()),
                    snippet: stripNewlines($card.find('.GI74Re').text().trim()),
                    image: imagesById.get(thumbnailId) || "No image",
                    date: $card.find('.ZE0LJd span').text().trim(),
                };
            });
        });
}
// Print the scraped results; report request/parsing failures instead of
// leaving the rejection unhandled.
getNewsInfo()
    .then(console.log)
    .catch((err) => {
        console.error("Failed to fetch Google News results:", err);
        process.exitCode = 1;
    });

输出:

[
   {
      "link":"https://www.newyorker.com/news/q-and-a/why-elon-musk-bought-twitter",
      "source":"The New Yorker",
      "title":"Why Elon Musk Bought Twitter",
      "snippet":"Portrait of Elon Musk looking off to the side. Musk, the C.E.O. of Tesla, has previously had some run-ins with the S.E.C.Source photograph by...",
      "image":"",
      "date":"2 weeks ago"
   }, ... other results
]

另外,您也可以使用 SerpApi 的 Google News Results API。SerpApi 提供免费套餐,每月可进行 100 次搜索用于测试;如果您需要更多搜索次数,则有付费计划。

不同之处在于,您只需要遍历结构化的 JSON,而不必从头开始编写所有代码、进行维护、挑选正确的选择器来提取所需的数据,也不必弄清楚如何绕过 Google 或其他搜索引擎的拦截。可以到 Playground(在线演示)中查看。

要集成的示例代码:

const SerpApi = require("google-search-results-nodejs");
// The API key from serpapi.com is read from the API_KEY environment variable:
// the bare identifier API_KEY was never declared, so the script threw a
// ReferenceError before doing anything, and a key should not be hard-coded.
const search = new SerpApi.GoogleSearch(process.env.API_KEY);
const searchString = "elon musk";                        // what we want to search
// Query parameters sent to SerpApi for a Google News search.
const params = {
  engine: "google",                                     // search engine
  q: searchString,                                      // search query
  google_domain: "google.com",                          // google domain: google.com, google.de, google.fr
  gl: "us",                                             // parameter defines the country to use for the Google search
  hl: "en",                                             // Parameter defines the language to use for the Google search
  tbm: "nws"                                            // parameter defines the type of search you want to do ("nws" means news)
};
/**
 * Maps a SerpApi response onto a flat list of news records.
 *
 * @param {{news_results?: Array<Object>}} [response] - response object; a
 *   missing `news_results` key is treated as "no results".
 * @returns {Array<{link: *, source: *, title: *, snippet: *, image: *, date: *}>}
 *   One record per result. `image` is the result's `thumbnail`, or the string
 *   "No image" when the result has none.
 */
const getNewsData = function ({ news_results = [] } = {}) {
  // Remove line breaks (not the letter "n"); non-string values such as a
  // missing snippet are passed through untouched instead of throwing.
  const stripNewlines = (text) =>
    typeof text === "string" ? text.replace(/\n/g, "") : text;

  return news_results.map((result) => {
    const { link, title, source, date, snippet, thumbnail: image = "No image" } = result;
    return {
      link,
      source,
      title: stripNewlines(title),
      snippet: stripNewlines(snippet),
      image,
      date,
    };
  });
};
/**
 * Promise adapter for the callback-based `search.json` call.
 *
 * @param {Object} params - search parameters handed to `search.json` unchanged.
 * @returns {Promise<*>} settles with whatever `search.json` passes to its callback.
 */
const getJson = function (params) {
  const pending = new Promise(function (settle) {
    search.json(params, settle);
  });
  return pending;
};
// Run the search and print the mapped results; report failures instead of
// leaving the rejection unhandled.
getJson(params)
  .then(getNewsData)
  .then(console.log)
  .catch((err) => {
    console.error("SerpApi request failed:", err);
    process.exitCode = 1;
  });

输出:

[
   {
      "link":"https://nypost.com/2022/05/13/elon-musk-backs-gop-bid-to-strip-disney-of-mickey-mouse-copyright/",
      "source":"New York Post",
      "title":"Elon Musk backs GOP bid to strip Disney of Mickey Mouse copyright",
      "snippet":"Elon Musk appeared to voice his support for a Republican senator's efforts n""+""to strip The Walt Disney Co. of its copyright of Mickey Mouse.",
      "image":"https://serpapi.com/searches/627e67d93c3fb22215607d9e/images/22b0f5e214e9045c6dc1c6c683cc0b1468248a0cb118e82ed3c7f8900a359195.jpeg",
      "date":"17 mins ago"
   }, ... other results
]

如果您想了解更多关于抓取 Google 的信息,可以查看我的博客文章《如何用 Node.js 抓取 Google News》。

免责声明:我在 SerpApi 工作。

最新更新