我有兴趣收集/抓取有关Chrome Webstore中可用的流行扩展所获得的评论的数据。
特别地,我需要检索为特定扩展留下的总评论数,然后检索此插件公开可用的所有评论。我的问题是:我不能写一个标准的PHP Curl scraper,因为我感兴趣的数据可以通过json请求获得,特别是,我需要调用:
- https://chrome.google.com/reviews/components为编号评论('numRatings')
- https://chrome.google.com/reviews/json/search评论("评论")
我试着写这个:
<script src="http://code.jquery.com/jquery-latest.js"></script>
<script type="text/javascript">
function getReviews(extensionId, callback) {
var entities = [{'url' : 'http://chrome.google.com/extensions/permalink?id=' + extensionId}];
var param = {"searchSpecs":[{"requireComment":true,"entities": entities,"groups":["public_comment"],"matchExtraGroups":true,"sortBy":"quality","startIndex":10,"numResults":10,"includeNickNames":true}],"applicationId":94};
$.ajax({
type: 'POST',
url: 'https://chrome.google.com/reviews/json/search',
contentType: 'application/xml',
xhrFields: {withCredentials: true },
dataType: 'json',
data: 'req=' + JSON.stringify(param) + '&requestSource=widget'
}).success(callback);
}
</script>
<script type="text/javascript">
$(document).ready(getReviews('gighmmpiobklfepjocnamgkkbiglidom', function(reviews) { console.log(reviews); }));
</script>
我不是很热衷于jQuery/JSON(-P),上面的代码肯定是错误的。
我的问题如下:
- 如何绕过同域策略?我试过YQL没有成功…
- 如何格式化我的url/'data'只检索的数字评论('numRatings')上chrome.google.com/reviews/components和评论('评论')上chrome.google.com/reviews/json/search为其id标识的特定扩展,例如ghympiobklfepjocnamgkkbiglidom ?
我已经使用PHP完成了对流行的Mozilla Addons的这种抓取,并使用标准的curl/XPath收集了我需要的数据。
谢谢你的帮助!
1)最简单的方法是创建一个Chrome扩展;
2)参见https://github.com/xpressyoo/MyExtensions[...]
getComments : function() {
var entities = [];
//each(Ext.extensions, function(data, id) {
entities.push({'url' : 'http://chrome.google.com/extensions/permalink?id=' + this.hash});
//});
Ext.XHR['comments'] = new Ajax({
'method' : 'POST',
'encodeURI' : false, // Needed
'url' : 'https://chrome.google.com/reviews/json/search',
'headers' : {
'Content-type' : 'application/xml'
},
'parameters' : {
'req' : JSON.stringify({'searchSpecs' : [{'entities' : entities, 'groups' : ['public_comment'], 'matchExtraGroups' : true,"sortBy":"quality", 'startIndex' : 0, 'numResults' : 80, 'includeNickNames' : true}], 'applicationId' : 94 }) + '&requestSource=widget'
},
'onSuccess' : function(xhr) {
var json = xhr.responseJSON;
if(json && json.searchResults ) {
this.comments = {
'total' : Number(json.searchResults[0].numAnnotations.toString().replace(/,/, '').toInt()),
'latest' : json.searchResults[0].annotations ? json.searchResults[0].annotations[0] :{},
'previous' : this.comments.total || null,
'latestPrevious' : $merge(this.comments.latest) || null,
'new' : this.comments['new'] || false
}
Ext.XHR['comments'] = null;
}
}.bind(this)
}).send();
return this;
},
[...]
和
var nbreviews = this.comments.total; //The number of reviews
var latestcomment = (this.comments.latest0 && this.comments.latest0.comment ? this.comments.latest0.comment.replace(/n/gi, '') : '');// get the latest comment
var nthcomment = (this.comments.latestn && this.comments.latestn.comment ? this.comments.latestn.comment.replace(/n/gi, '') : '');//Get the nth comment
地点:
'latestn' : json.searchResults[0].annotations ? json.searchResults[0].annotations[n] :{},
这是一种在PHP中使用并行cURL的方法。这个脚本刮掉Chrome网络商店中所有的扩展(按受欢迎程度排名),并检索信息,如:
- 用户数量
- 星级数
- 文本评审数
- 每个文本审查的字符数(每个扩展最多100个审查)
//GET URL
$url0 = "https://chrome.google.com/";
//AUTO LOOP
foreach(range(0, 705, 5) as $x) {
//Nb PAGES TO DOWNLOAD
$frompge = $x+1;
$topge = $x+5;
$nbpages = ($topge - $frompge)+1;
$zitems = $nbpages*20;
//MULTI cURL INIT
$mh = curl_multi_init();
$running = null;
//GENERATE URLs ARRAY
$urls = array();
for ($a = $frompge; $a <= $topge; $a++){
$aa = $url0 . 'webstore/list/most_popular/'. $a .'?category=ext';
$urls[] = $aa;
}
foreach ($urls as $name => $url)
{
$c[$name]=curl_init($url);
curl_setopt($c[$name], CURLOPT_HEADER, false);
curl_setopt($c[$name], CURLOPT_FAILONERROR, true);
curl_setopt($c[$name], CURLOPT_FOLLOWLOCATION, true);
curl_setopt($c[$name], CURLOPT_AUTOREFERER, true);
curl_setopt($c[$name], CURLOPT_RETURNTRANSFER, true);
curl_setopt($c[$name], CURLOPT_TIMEOUT, 10);
curl_multi_add_handle ($mh,$c[$name]);
}
// execute all queries simultaneously, and continue when all are complete
do {
curl_multi_exec($mh, $running);
} while ($running >0);
$html = array();
foreach ($urls as $name => $url)
{
$html[]=curl_multi_getcontent($c[$name]);
curl_multi_remove_handle($mh,$c[$name]);
curl_close($c[$name]);
}
curl_multi_close($mh);
for ($b = 0; $b <= $nbpages-1; $b++) {
// Parse the HTML information and return the results.
$dom = new DOMDocument();
@$dom->loadHtml($html[$b]);
$xpath = new DOMXPath($dom);
$links = $xpath->query("//a[contains(@class, 'title-a')]");
$result = array();
foreach ( $links as $item ) {
$newDom = new DOMDocument;
$newDom->appendChild($newDom->importNode($item,true));
$xpath = new DOMXPath( $newDom );
$cleaner = array(" users", " user", "(", ")", ","," ");
$data = str_replace($cleaner,"",trim($xpath->query("//script")->item(0)->nodeValue));
list($b1,$id,$b2,$b3,$b4,$name,$b5,$b6,$b7,$b8,$b9,$b10,$b11,$b12,$b13,$nbusers) = explode(""", $data);
$label = str_replace(" ", "", strtolower(ereg_replace("[^A-Za-z0-9 ]", "", $name)));
//CATEGORIES (based on nb of users)
if($nbusers<100){$category = '1';$color = 'inherit';}
else if($nbusers>=100 && $nbusers<1000){$category = '2';$color = '#E6EEEE';}
else if($nbusers>=1000 && $nbusers<10000){$category = '3';$color = '#CDDEDE';}
else if($nbusers>=10000 && $nbusers<100000){$category = '4';$color = '#B5CDCD';}
else if($nbusers>=100000 && $nbusers<1000000){$category = '5';$color = '#9CBDBD';}
else if($nbusers == '1000000+'){$category = '6';$color = '#83ACAC';}
else{$category = '-9';}
/////////////////////////////////////////////LOOP REVIEWS
$extURL = 'http://chrome.google.com/extensions/permalink?id='.$id;
$c1 = curl_init('https://chrome.google.com/reviews/json/search');
$c1a = curl_init('https://chrome.google.com/reviews/json/search');
$c2 = curl_init('https://chrome.google.com/reviews/json/lookup');
$fields1 = http_build_query(array(
'req' => '{"searchSpecs":[{"requireComment":true,"entities":[{"url":"'.$extURL.'"}],"groups":["public_comment"],"matchExtraGroups":true,"sortBy":"quality","startIndex":0,"numResults":100,"includeNickNames":false}],"applicationId":94}',
));
$options1 = array(
CURLOPT_RETURNTRANSFER => true,
CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_POST => true,
CURLOPT_TIMEOUT => 10,
CURLOPT_POSTFIELDS => $fields1,
);
$fields1a = http_build_query(array(
'req' => '{"searchSpecs":[{"requireComment":true,"entities":[{"url":"'.$extURL.'"}],"groups":["public_comment"],"matchExtraGroups":true,"startIndex":0,"numResults":100,"includeNickNames":false}],"applicationId":94}',
));
$options1a = array(
CURLOPT_RETURNTRANSFER => true,
CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_POST => true,
CURLOPT_TIMEOUT => 10,
CURLOPT_POSTFIELDS => $fields1a,
);
$fields2 = http_build_query(array(
'req' => '{"entities":[{"url" : "'.$extURL.'", "includeAggregateInfo" : true}],"applicationId":94}',
));
$options2 = array(
CURLOPT_RETURNTRANSFER => true,
CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_POST => true,
CURLOPT_TIMEOUT => 10,
CURLOPT_POSTFIELDS => $fields2,
);
curl_setopt_array($c1, $options1);
curl_setopt_array($c1a, $options1a);
curl_setopt_array($c2, $options2);
$mh2 = curl_multi_init();
curl_multi_add_handle($mh2,$c1);
curl_multi_add_handle($mh2,$c1a);
curl_multi_add_handle($mh2,$c2);
$active = null;
do {
curl_multi_exec($mh2, $active);
} while ($active >0);
//close the handles$c1 = curl_init('https://chrome.google.com/reviews/json/search');
$json1=curl_multi_getcontent($c1);
$json1a=curl_multi_getcontent($c1a);
$json2=curl_multi_getcontent($c2);
curl_multi_remove_handle($mh2, $c1);
curl_multi_remove_handle($mh2, $c1a);
curl_multi_remove_handle($mh2, $c2);
curl_multi_close($mh2);
$data1 = json_decode(utf8_encode($json1), true);
$data1a = json_decode(utf8_encode($json1a), true);
$data2 = json_decode(utf8_encode($json2), true);
if ($data1['channelHeader']['errorCode']) return;
$nbreviews = $data1['searchResults'][0]['numAnnotations'];
if ($nbreviews > 100){$nbreviews2=100;}
else{$nbreviews2=$nbreviews;}
//Sum strings
$comments = $data1['searchResults'][0]['annotations'];
$sum =0;
foreach($comments as $comment){
$msg = preg_replace('/[nrt]/', ' ', htmlspecialchars($comment['comment']));
$msg = str_replace(">", "", $msg);
$msg = str_replace(" ", "", $msg);
$strlen = strlen($msg);
$sum += $strlen;
}
$add = $sum;
$final = $add/$nbreviews2;
//Sum strings A
if ($data1a['channelHeader']['errorCode']) return;
$nbreviewsa = $data1a['searchResults'][0]['numAnnotations'];
$commentsa = $data1a['searchResults'][0]['annotations'];
$suma =0;
foreach($commentsa as $commenta){
$msga = preg_replace('/[nrt]/', ' ', htmlspecialchars($commenta['comment']));
$msga = str_replace(">", "", $msga);
$msga = str_replace(" ", "", $msga);
$strlena = strlen($msga);
$suma += $strlena;
}
$adda = $suma;
$finala = $adda/$nbreviews2;
//Ratings
if ($data2['channelHeader']['errorCode']) return;
$nbratings = $data2['annotations'][0]['aggregateInfo']['numRatings'];
$nbstars = $data2['annotations'][0]['aggregateInfo']['averageRating'];
$delta = $nbratings - $nbreviews;
$ratio = $nbratings/$nbusers;
$ratio2 = $nbreviews/$nbusers;
////////////////////////////////////////////END LOOP REVIEWS
//PUT VALUES TOGETHER
$result[] = array($name,$label,$id,$category,$nbusers,$nbratings,$nbreviews,$nbreviewsa,$delta,$ratio,$ratio2,$nbstars,$nbreviews2,$add,$final,$adda,$finala);
}//END FOREACH
//print_r($result,false);
//DISPLAY RESULTS
for ($z = 0; $z <= 20; $z++) {
echo "<tr><td class="non">" .$result[$z][0] . "</td><td class="non">" .$result[$z][1] . "</td><td>" .$result[$z][3] . "</td><td>" .$result[$z][4] . "</td><td>" .$result[$z][5] . "</td><td>" .$result[$z][6] . "</td><td>" .$result[$z][7] . "</td><td>" .$result[$z][8] . "</td><td>" .$result[$z][9] . "</td><td>" .$result[$z][10] . "</td><td>" .$result[$z][11] . "</td><td>" .$result[$z][12] . "</td><td>" .$result[$z][13] . "</td><td>" .$result[$z][14] . "</td><td>" .$result[$z][15] . "</td><td>" .$result[$z][16] . "</td></tr>";
ob_flush();
flush();
}
}
}//END FOREACH