我有这个scraper库,如果第一个用户代理返回错误,我想更改我的用户代理,但这个代码不起作用,如果第一用户代理不起作用。我已经发送了第二次尝试,但这永远不会完成,因为onHTML没有触发:
package scraper
import (
"net/http"
"github.com/davecgh/go-spew/spew"
"github.com/gocolly/colly"
)
const (
fbUserAgent = "ua 1"
userAgent = "ua 2"
)
type ScrapeResult struct {
Title string
Description string
SiteName string
URL string
Images []string
}
func Scrape2(url string) (*ScrapeResult, error) {
var (
res *ScrapeResult
scrapeErr error
done = make(chan bool, 1)
c = colly.NewCollector()
)
c.OnError(func(r *colly.Response, err error) {
if ua := r.Request.Headers.Get("User-Agent"); ua == fbUserAgent {
c.Request(
"GET",
url,
nil,
nil,
http.Header{
"User-Agent": []string{userAgent},
"Accept": []string{"*/*"},
},
)
} else {
scrapeErr = err
done <- true
}
})
c.OnHTML("html", func(e *colly.HTMLElement) {
spew.Dump("ON HTML")
res = &ScrapeResult{URL: url}
res.Title = FindTitle(e)
res.Description = FindDescription(e)
res.SiteName = FindSiteName(e)
res.Images = FindImages(e)
done <- true
})
c.Request(
"GET",
url,
nil,
nil,
http.Header{
"User-Agent": []string{fbUserAgent},
"Accept": []string{"*/*"}, // * / *
"Accept-Language": []string{"en-GB,en-US;q=0.9,en;q=0.8"},
"Accept-Encoding": []string{"gzip, deflate, br"},
"Connection": []string{"keep-alive"},
"sec-ch-ua": []string{` Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90`},
},
)
<- done
return res, scrapeErr
}
func FindTitle(e *colly.HTMLElement) string {
if content := e.ChildAttr(`meta[property="og:title"]`, "content"); len(content) > 0 {
return content
}
return ""
}
func FindDescription(e *colly.HTMLElement) string {
if content := e.ChildAttr(`meta[property="og:description"]`, "content"); len(content) > 0 {
return content
}
return ""
}
func FindSiteName(e *colly.HTMLElement) string {
if content := e.ChildAttr(`meta[property="og:site_name"]`, "content"); len(content) > 0 {
return content
}
return ""
}
func FindImages(e *colly.HTMLElement) []string {
images := make([]string, 0)
if content := e.ChildAttr(`meta[property="og:image"]`, "content"); len(content) > 0 {
images = append(images, content)
}
return images
}
如何第二次发出colly请求并触发onHTML?谢谢
您可以设置属性收集器。CheckHead=真实
这样做的目的是确保您首先执行GetHEAD操作来检查连接问题,如果失败,则会重试。
您需要gocolly的/v2才能包含此功能。
https://github.com/gocolly/colly/blob/master/colly.go#L110