一个Google搜索结果页面的源代码



我是网页抓取的初学者,最近刚接触了Python中requests模块和selenium的一些基础用法。

我试图使用requests模块下载一个简单网页的源代码,即在谷歌上搜索任意关键字(比如"Cats")时得到的结果页面。下面是我的代码:


import requests
# The keyword to search Google for.
search_keyword = "Cats" 
# Fetch the page with requests.
# NOTE(review): the URL is built as "...search?q" + keyword with no "=" in
# between, so the requested URL ends in "?qCats" rather than "?q=Cats".
page = requests.get(r"https://www.google.com/search?q"+search_keyword) 
# Raise an exception (aborting the script) if the response has an HTTP error status.
page.raise_for_status() 
# The response body of the requested page, decoded to a string.
source_code = page.text  

现在的问题是:source_code变量中保存的源代码,与我在Google Chrome中按Ctrl+U看到的源代码非常不同。我已经仔细比较过两者,它们差别很大。

为了详细说明,下面是从Chrome源代码中复制的最后七行:

}catch(e){_._DumpException(e)}
try{
var uj=function(a,b,c){_.Kd.log(46,{att:a,max:b,url:c})},wj=function(a,b,c){_.Kd.log(47,{att:a,max:b,url:c});a<b?vj(a+1,b):_.J.log(Error("aa`"+a+"`"+b),{url:c})},vj=function(a,b){if(xj){const d=_.ke("SCRIPT");d.async=!0;d.type="text/javascript";d.charset="UTF-8";var c=d;c.src=_.Ac(xj);_.tj(c);d.onload=_.be(uj,a,b,d.src);d.onerror=_.be(wj,a,b,d.src);_.Kd.log(45,{att:a,max:b,url:d.src});_.ee("HEAD")[0].appendChild(d)}},yj=class extends _.H{constructor(a){super(a)}},zj=_.F(_.Fd,yj,17)||new yj,Aj,xj=
(Aj=_.F(zj,_.ic,1))?_.qj(_.C(Aj,4)||""):null,Bj,Cj=(Bj=_.F(zj,_.ic,2))?_.qj(_.C(Bj,4)||""):null,Dj=function(){vj(1,2);if(Cj){const a=_.ke("LINK");a.setAttribute("type","text/css");_.pj(a,Cj,"stylesheet");let b=_.$c();b&&a.setAttribute("nonce",b);_.ee("HEAD")[0].appendChild(a)}};(function(){const a=_.Gd();if(_.E(a,18))Dj();else{const b=_.C(a,19)||0;window.addEventListener("load",()=>{window.setTimeout(Dj,b)})}})();
}catch(e){_._DumpException(e)}
})(this.gbar_);
// Google Inc.
;});})();(function(){google.drty&&google.drty(undefined,true);})();});</script><div></div><div jscontroller="MTV2Lb" style="display:none" src="/uviewer?q=cats&amp;origin=https%3A%2F%2Fwww.google.com" id="Rvx4kc" jsaction="rcuQ6b:npT2md;u0pjoe:Hq0NGf"></div><div jscontroller="W0N1pf" id="DDeXhf" jsaction="u0pjoe:Hq0NGf"></div><div id="lfootercc"><script nonce="aZ5SspHCLpIp5NbwBB_sog">(function(){for(var i in google.iir||{}){_setImagesSrc([i],google.iir[i]);}google.iir={};})();(function(){var _jnu='https://www.google.com/ads/measurement/t';var _jnuid=2500;google.x({'id':'jnu'},function(){var bt = google.sx && google.sx.setTimeout || window.setTimeout;bt(function(){google.log('','',_jnu);},_jnuid);});})();google.jslm=3;</script><div id="reviewDialog" data-async-context="async_id_prefix:" data-jiis="up" data-async-type="reviewDialog" data-async-context-required="async_id_prefix" class="yp"></div><div id="dbg_"></div></div></body></html>

下面是存储在source_code中的字符串的最后七行:

else top.location='/doodles/';};})();</script><input value="AK50M_UAAAAAY717njRHtGFQobKw12k0KwtaX02a7bb3" name="iflsig" type="hidden"></span></span></td><td class="fl sblc" align="left" nowrap="" width="25%"><a href="/advanced_search?hl=en-PK&amp;authuser=0">Advanced search</a></td></tr></table><input id="gbv" name="gbv" type="hidden" value="1"><script nonce="t1EKgWOJQagENoMoyIq8zw">(function(){var a,b="1";if(document&&document.getElementById)if("undefined"!=typeof XMLHttpRequest)b="2";else if("undefined"!=typeof ActiveXObject){var c,d,e=["MSXML2.XMLHTTP.6.0","MSXML2.XMLHTTP.3.0","MSXML2.XMLHTTP","Microsoft.XMLHTTP"];for(c=0;d=e[c++];)try{new ActiveXObject(d),b="2"}catch(h){}}a=b;if("2"==a&&-1==location.search.indexOf("&gbv=2")){var f=google.gbvu,g=document.getElementById("gbv");g&&(g.value=a);f&&window.setTimeout(function(){location.href=f},0)};}).call(this);</script></form><div id="gac_scont"></div><div style="font-size:83%;min-height:3.5em"><br><div id="gws-output-pages-elements-homepage_additional_languages__als"><style>#gws-output-pages-elements-homepage_additional_languages__als{font-size:small;margin-bottom:24px}#SIvCob{color:#3c4043;display:inline-block;line-height:28px;}#SIvCob a{padding:0 3px;}.H6sW5{display:inline-block;margin:0 2px;white-space:nowrap}.z4hgWe{display:inline-block;margin:0 2px}</style><div id="SIvCob">Google offered in:  <a dir="rtl" href="https://www.google.com/setprefs?sig=0_r1jPgArqJ4B4gDPx-dpt_P_fF1c%3D&amp;hl=ur&amp;source=homepage&amp;sa=X&amp;ved=0ahUKEwi7w86Hk738AhVwkZUCHQZ8Bm4Q2ZgBCAU">&#1575;&#1585;&#1583;&#1608;</a>    <a dir="rtl" href="https://www.google.com/setprefs?sig=0_r1jPgArqJ4B4gDPx-dpt_P_fF1c%3D&amp;hl=ps&amp;source=homepage&amp;sa=X&amp;ved=0ahUKEwi7w86Hk738AhVwkZUCHQZ8Bm4Q2ZgBCAY">&#1662;&#1690;&#1578;&#1608;</a>    <a dir="rtl" 
href="https://www.google.com/setprefs?sig=0_r1jPgArqJ4B4gDPx-dpt_P_fF1c%3D&amp;hl=sd&amp;source=homepage&amp;sa=X&amp;ved=0ahUKEwi7w86Hk738AhVwkZUCHQZ8Bm4Q2ZgBCAc">&#1587;&#1606;&#1676;&#1610;</a>  </div></div></div><span id="footer"><div style="font-size:10pt"><div style="margin:19px auto;text-align:center" id="WqQANb"><a href="/intl/en/ads/">Advertising�Programs</a><a href="/intl/en/about.html">About Google</a><a href="https://www.google.com/setprefdomain?prefdom=PK&amp;prev=https://www.google.com.pk/&amp;sig=K_MVkcQYGVALpK2CEzShaNNS889C4%3D">Google.com.pk</a></div></div><p style="font-size:8pt;color:#70757a">&copy; 2023 - <a href="/intl/en/policies/privacy/">Privacy</a> - <a href="/intl/en/policies/terms/">Terms</a></p></span></center><script nonce="t1EKgWOJQagENoMoyIq8zw">(function(){window.google.cdo={height:757,width:1440};(function(){var a=window.innerWidth,b=window.innerHeight;if(!a||!b){var c=window.document,d="CSS1Compat"==c.compatMode?c.documentElement:c.body;a=d.clientWidth;b=d.clientHeight}a&&b&&(a!=google.cdo.width||b!=google.cdo.height)&&google.log("","","/client_204?&atyp=i&biw="+a+"&bih="+b+"&ei="+google.kEI);}).call(this);})();</script> <script nonce="t1EKgWOJQagENoMoyIq8zw">(function(){google.xjs={ck:'xjs.hp.L0TU2uVtv08.L.X.O',cs:'ACT90oHT93DEQlP6_b1H10Ysv5zFk8Lfjw',excm:[]};})();</script>  <script nonce="t1EKgWOJQagENoMoyIq8zw">(function(){var u='/xjs/_/js/kx3dxjs.hp.en.O9Ix1H-6nJI.O/amx3dAADoBABQAGAB/dx3d1/edx3d1/rsx3dACT90oE7kMfbiyrYlGzQlW0b_TuKFV-B6g/mx3dsb_he,d';var amd=0;
var d=this||self,e=function(a){return a};var g;var l=function(a,b){this.g=b===h?a:""};l.prototype.toString=function(){return this.g+""};var h={};
function m(){var a=u;google.lx=function(){p(a);google.lx=function(){}};google.bx||google.lx()}
function p(a){google.timers&&google.timers.load&&google.tick&&google.tick("load","xjsls");var b=document;var c="SCRIPT";"application/xhtml+xml"===b.contentType&&(c=c.toLowerCase());c=b.createElement(c);a=null===a?"null":void 0===a?"undefined":a;if(void 0===g){b=null;var k=d.trustedTypes;if(k&&k.createPolicy){try{b=k.createPolicy("goog#html",{createHTML:e,createScript:e,createScriptURL:e})}catch(q){d.console&&d.console.error(q.message)}g=b}else g=b}a=(b=g)?b.createScriptURL(a):a;a=new l(a,h);c.src=
a instanceof l&&a.constructor===l?a.g:"type_error:TrustedResourceUrl";var f,n;(f=(a=null==(n=(f=(c.ownerDocument&&c.ownerDocument.defaultView||window).document).querySelector)?void 0:n.call(f,"script[nonce]"))?a.nonce||a.getAttribute("nonce")||"":"")&&c.setAttribute("nonce",f);document.body.appendChild(c);google.psa=!0};google.xjsu=u;setTimeout(function(){0<amd?google.caft(function(){return m()},amd):m()},0);})();function _DumpException(e){throw e;}
function _F_installCss(c){}
(function(){google.jl={blt:'none',chnk:0,dw:false,dwu:true,emtn:0,end:0,ico:false,ikb:0,ine:false,injs:'none',injt:0,injth:0,injv2:false,lls:'default',pdt:0,rep:0,snet:true,strt:0,ubm:false,uwp:true};})();(function(){var pmc='{x22dx22:{},x22sb_hex22:{x22agenx22:true,x22cgenx22:true,x22clientx22:x22heirloom-hpx22,x22dhx22:true,x22dsx22:x22x22,x22flx22:true,x22hostx22:x22google.comx22,x22jsonpx22:true,x22msgsx22:{x22ciblx22:x22Clear Searchx22,x22dymx22:x22Did you mean:x22,x22lckyx22:x22I\u0026#39;m Feeling Luckyx22,x22lmlx22:x22Learn morex22,x22psrcx22:x22This search was removed from your \u003Ca hrefx3d\x22/history\x22\u003EWeb History\u003C/a\u003Ex22,x22psrlx22:x22Removex22,x22sbitx22:x22Search by imagex22,x22srchx22:x22Google Searchx22},x22ovrx22:{},x22pqx22:x22x22,x22rfsx22:[],x22sbasx22:x220 3px 8px 0 rgba(0,0,0,0.2),0 0 0 1px rgba(0,0,0,0.08)x22,x22stokx22:x22xGs-7Ks6Q8q3RMls1jXZ4MoFStcx22}}';google.pmc=JSON.parse(pmc);})();</script>        </body></html>

是什么原因导致了这种差异?谷歌是否采取了某些措施来阻止别人下载他们的源代码?另外,如何在Python中获得网页的原始源代码?

这取决于您的用户代理(User-Agent)、语言、Cookie等等。requests默认使用python-requests/*作为User-Agent请求头,因此您需要把它替换为浏览器的请求头:在浏览器开发者工具(devtools)的Network选项卡中找到对应的请求,把其中的请求头的值复制到您的代码里。另外,我还在URL中补上了一个等号。

from urllib.parse import quote_plus

import requests

# The keyword to search Google for.
search_keyword = "Cats"
# Search URL with the "?q=" query parameter (note the "=" after "q").
# The keyword is percent-encoded so that spaces, "&", "#", etc. cannot
# corrupt the query string.
url = "https://www.google.com/search?q=" + quote_plus(search_keyword)  # ADDED ?q=
# Browser-like request headers, copied from the Network tab in dev tools.
# Without them requests identifies itself as "python-requests/*" and Google
# serves a different page than it serves to a browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5026.0 Safari/537.36 Edg/103.0.1254.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}
# Fetch the page; the timeout keeps the script from hanging forever if the
# server never responds.
page = requests.get(url, headers=headers, timeout=10)
# Raise an exception (aborting the script) on an HTTP error status.
page.raise_for_status()
# The page source as a decoded string.
source_code = page.text
# Save the source for inspection. An explicit UTF-8 encoding avoids
# UnicodeEncodeError on platforms whose default encoding is not UTF-8.
with open("google1.html", "w", encoding="utf-8") as f:
    f.write(source_code)

最新更新