我想从网站 https://eqrreportviewer.ferc.gov/ 下载一个 zip 文件。手动下载的步骤是:首先单击 "Filing Inquiries"(归档查询)选项卡;在 reportType 下拉列表中选择 "Submissions By Date",在 export 下拉列表中选择 CSV;然后点击提交(Submit)按钮,zip 文件就会被下载。我想把这个过程自动化。我已经用 C# 编写了一段代码:先捕获浏览器发出的请求及其标头,再把同样的详细信息发送给网站,但我无法通过这段代码下载文件。
这是我写的代码:
// Shared state passed between the request steps below.

// HTML/text of the most recent response: written by GetLandingPage and
// overwritten by PostToPageForSubmitBtn; read when building the post-back form.
public static string PageSourceCode { get; set; }
// Cookie value taken from the landing page's Set-Cookie header (after StripCookie);
// sent back as the Cookie header on the POST so the server sees the same session.
public static string SessionID { get; set; }
// The form-encoded body produced by PopulatePostBackValueForSubmitBtn and
// written to the request stream by PostToPageForSubmitBtn.
public static string PostBackValue { get; set; }
// Base URL used for both the initial GET and the post-back POST.
public static string AcquisitionURL = "https://eqrreportviewer.ferc.gov";
/// <summary>
/// Program entry point; runs the whole acquisition sequence once.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args) => Acquire();
// Runs the three acquisition steps. The order matters: each step consumes
// static state written by the previous one.
private static void Acquire()
{
// 1. GET the landing page -> fills PageSourceCode and SessionID.
GetLandingPage();
// 2. Build the form body from PageSourceCode -> fills PostBackValue.
PopulatePostBackValueForSubmitBtn();
// 3. POST PostBackValue (with SessionID as the cookie) and store the reply.
PostToPageForSubmitBtn();
}
/// <summary>
/// Requests the landing page with browser-like headers, then stores the page
/// HTML in <see cref="PageSourceCode"/> and the (stripped) Set-Cookie value in
/// <see cref="SessionID"/> for the follow-up POST.
/// </summary>
private static void GetLandingPage()
{
    string mainPageOutput;
    HttpWebRequest objRequestLandingPage = (HttpWebRequest)WebRequest.Create(AcquisitionURL);
    objRequestLandingPage.Method = WebRequestMethods.Http.Get;
    objRequestLandingPage.Headers.Add("Cache-Control", "max-age=0");
    objRequestLandingPage.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
    // Do NOT hand-write "Accept-Encoding: gzip, deflate, br": HttpWebRequest does not
    // decode the body unless AutomaticDecompression is set, so a compressed reply would
    // be read as garbage text and the hidden form fields could not be extracted.
    // Setting AutomaticDecompression makes the framework send a matching
    // Accept-Encoding header and transparently decompress the response.
    objRequestLandingPage.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
    objRequestLandingPage.Headers.Add("Accept-Language", "en-US,en;q=0.9");
    objRequestLandingPage.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36";
    objRequestLandingPage.Headers.Add("Sec-Fetch-Dest", "document");
    objRequestLandingPage.Headers.Add("Sec-Fetch-Mode", "navigate");
    objRequestLandingPage.Headers.Add("Sec-Fetch-Site", "none");
    objRequestLandingPage.Headers.Add("Sec-Fetch-User", "?1");
    objRequestLandingPage.Headers.Add("Upgrade-Insecure-Requests", "1");
    // "Connection" is a restricted header; KeepAlive is the supported way to set it.
    objRequestLandingPage.KeepAlive = true;
    objRequestLandingPage.Host = "eqrreportviewer.ferc.gov";

    using (WebResponse objResponseLandingPage = objRequestLandingPage.GetResponse())
    {
        using (Stream streamLandingPage = objResponseLandingPage.GetResponseStream())
        using (StreamReader streamReaderLandingPage = new StreamReader(streamLandingPage))
        {
            mainPageOutput = streamReaderLandingPage.ReadToEnd();
        }

        // Raw Set-Cookie header; reduced to a sendable cookie value below.
        SessionID = objResponseLandingPage.Headers["Set-Cookie"];
    }

    // StripCookie is defined outside this section of the file.
    SessionID = StripCookie(SessionID);

    // Set the source code of the page
    PageSourceCode = mainPageOutput;
}
/// <summary>
/// Builds the form-encoded POST body that mimics pressing "Submit" on the
/// Filing Inquiries tab and stores it in <see cref="PostBackValue"/>.
/// Does nothing when <see cref="PageSourceCode"/> is null or empty.
/// </summary>
private static void PopulatePostBackValueForSubmitBtn()
{
    if (String.IsNullOrEmpty(PageSourceCode))
    {
        return;
    }

    const string summaryPrefix = "TabContainerReportViewer$TabPanelReporting$TabContainerReports$TabPanelSummaryReports$";
    const string inquiriesPrefix = "TabContainerReportViewer$TabPanelReporting$TabContainerReports$TabPanelFilingInquiries$";
    const string filingsPrefix = "TabContainerReportViewer$TabPanelDownloads$TabContainerDownloads$TabPanelSelectiveFilings$";

    // Read "today" once so both dates are consistent even if the clock crosses midnight,
    // and format with a fixed M/d/yyyy pattern instead of the machine's culture-dependent
    // short date (which would send e.g. "30.04.2020" on a non-US machine).
    System.DateTime today = System.DateTime.Now.Date;
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
    string fromDate = today.AddDays(-30).ToString("M/d/yyyy", invariant);
    string toDate = today.ToString("M/d/yyyy", invariant);

    // get fields from landing page
    Dictionary<string, string> formFields = GetFormFields(PageSourceCode);

    formFields[summaryPrefix + "ddlReportTypeSum"] = "0";
    formFields[summaryPrefix + "ddlReportPeriodSum"] = "650";
    formFields[summaryPrefix + "ListSearchExtender1_ClientState"] = String.Empty;

    formFields[inquiriesPrefix + "ddlReportType"] = "4";
    formFields[inquiriesPrefix + "txtFromSubmissionDate"] = fromDate;
    formFields[inquiriesPrefix + "txtToSubmissionDate"] = toDate;
    formFields[inquiriesPrefix + "ddlExport"] = "2";
    formFields[inquiriesPrefix + "btnSubmitOptional"] = "Submit";

    formFields[filingsPrefix + "txtCID"] = String.Empty;
    formFields[filingsPrefix + "txtFilingOrg"] = String.Empty;
    formFields[filingsPrefix + "ddlQuarter"] = "Pick";
    formFields[filingsPrefix + "ddlDownloadType"] = "CSV";
    formFields[filingsPrefix + "txtName"] = String.Empty;
    formFields[filingsPrefix + "txtEmail"] = String.Empty;

    formFields["__EVENTTARGET"] = String.Empty;
    formFields["__EVENTARGUMENT"] = String.Empty;
    formFields["__LASTFOCUS"] = String.Empty;
    formFields["__AjaxControlToolkitCalendarCssLoaded"] = String.Empty;

    // Tab client-state JSON. The inner quotes must be escaped; unescaped they
    // terminate the C# string literal and the file does not compile.
    formFields["TabContainerReportViewer_ClientState"] = "{\"ActiveTabIndex\" : 0,\"TabState\": [true,true]}";
    formFields["TabContainerReportViewer_TabPanelReporting_TabContainerReports_ClientState"] = "{\"ActiveTabIndex\" : 1,\"TabState\": [true,true]}";
    formFields["TabContainerReportViewer_TabPanelDownloads_TabContainerDownloads_ClientState"] = "{\"ActiveTabIndex\" : 0,\"TabState\": [true,true]}";

    // ViewState, ViewStateGenerator and ViewStateEncrypted are declared outside this
    // section of the file. NOTE(review): presumably they hold the values parsed from the
    // landing page - confirm they are populated before this method runs.
    formFields["__VIEWSTATE"] = ViewState;
    formFields["__VIEWSTATEGENERATOR"] = ViewStateGenerator;
    formFields["__VIEWSTATEENCRYPTED"] = ViewStateEncrypted;

    PostBackValue = FormatPostString(formFields);
}
/// <summary>
/// POSTs <see cref="PostBackValue"/> to the site with browser-like headers and the
/// session cookie, saves the raw response body to disk, and also stores the
/// response decoded as text in <see cref="PageSourceCode"/>.
/// </summary>
/// <exception cref="System.InvalidOperationException">
/// <see cref="PostBackValue"/> has not been populated.
/// </exception>
private static void PostToPageForSubmitBtn()
{
    if (PostBackValue == null)
    {
        throw new System.InvalidOperationException(
            "PostBackValue is not set; call PopulatePostBackValueForSubmitBtn first.");
    }

    // Content-Length counts bytes, not characters, so encode first and measure the bytes.
    byte[] postBytes = System.Text.Encoding.UTF8.GetBytes(PostBackValue);

    HttpWebRequest objRequestPostPage = (HttpWebRequest)WebRequest.Create(AcquisitionURL);
    objRequestPostPage.Method = WebRequestMethods.Http.Post;
    objRequestPostPage.ContentLength = postBytes.Length;
    objRequestPostPage.ContentType = "application/x-www-form-urlencoded";
    objRequestPostPage.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
    objRequestPostPage.KeepAlive = true;
    objRequestPostPage.Host = "eqrreportviewer.ferc.gov";
    objRequestPostPage.Headers.Add("Cache-Control", "max-age=0");
    objRequestPostPage.Headers.Add("Sec-Fetch-Dest", "document");
    objRequestPostPage.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36";
    objRequestPostPage.Headers.Add("Origin", "https://eqrreportviewer.ferc.gov");
    objRequestPostPage.Headers.Add("Sec-Fetch-Site", "same-origin");
    objRequestPostPage.Headers.Add("Sec-Fetch-Mode", "navigate");
    objRequestPostPage.Referer = "https://eqrreportviewer.ferc.gov/";
    // Instead of a hand-written Accept-Encoding header (which would leave a compressed
    // body undecoded), let the framework negotiate and undo gzip/deflate itself.
    objRequestPostPage.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
    objRequestPostPage.Headers.Add("Accept-Language", "en-US,en;q=0.9");
    // Pass in the ASP.NET Session ID
    if (!String.IsNullOrEmpty(SessionID))
    {
        objRequestPostPage.Headers.Add("Cookie", SessionID);
    }
    objRequestPostPage.Headers.Add("Upgrade-Insecure-Requests", "1");
    objRequestPostPage.Headers.Add("Sec-Fetch-User", "?1");
    objRequestPostPage.ServicePoint.Expect100Continue = false;

    // Post the arguments
    using (Stream requestStream = objRequestPostPage.GetRequestStream())
    {
        requestStream.Write(postBytes, 0, postBytes.Length);
    }

    // Get response
    using (HttpWebResponse responsePostPage = (HttpWebResponse)objRequestPostPage.GetResponse())
    using (Stream responseStream = responsePostPage.GetResponseStream())
    using (MemoryStream buffer = new MemoryStream())
    {
        // A network stream can be read only once. Previously it was drained by
        // StreamReader.ReadToEnd() and then handed to the file writer, which therefore
        // wrote nothing. Buffer the raw bytes so they can feed both the file and the text.
        responseStream.CopyTo(buffer);

        // Write the untouched bytes: decoding a binary payload to text and back would corrupt it.
        // NOTE(review): the path's backslashes were missing ("C:Testtest.csv"); restored here.
        buffer.Position = 0;
        using (FileStream file = new FileStream(@"C:\Test\test.csv", FileMode.Create, FileAccess.Write))
        {
            buffer.CopyTo(file);
        }

        // Keep exposing the response as text, as before (useful when the server
        // answers with an HTML page instead of the download).
        buffer.Position = 0;
        using (StreamReader reader = new StreamReader(buffer))
        {
            PageSourceCode = reader.ReadToEnd();
        }
    }
}
有人能告诉我是不是哪里做错了吗?目前所有的值都是硬编码的;只要它能正常工作,我之后会把代码整理好。
此外,我收到的响应中没有 Content-Disposition 响应标头,而在 Chrome 浏览器中执行同样的操作时,是可以得到这个标头的。
我的代码应该做哪些修改?或者我是否遗漏了什么?任何帮助或建议都将大大有助于推进这个问题。
我最终没能用 C# 完成这个操作。
最后,我改用 Python,结合 Selenium 和 Chrome WebDriver 完成了这项任务:
# Downloads the "Submission by Date" CSV export (delivered as a zip) from the
# FERC EQR report viewer by driving headless Chrome through the web form.
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

DOWNLOAD_DIR = "/databricks/driver"
URL = "https://eqrreportviewer.ferc.gov/"
# Common id prefix of every control on the "Filing Inquiries" tab.
PANEL_ID = "TabContainerReportViewer_TabPanelReporting_TabContainerReports_TabPanelFilingInquiries"
# Upper bound, in seconds, for the file to finish downloading.
DOWNLOAD_TIMEOUT = 120


def _list_dir(directory):
    """Return the set of entry names in `directory` (empty if it does not exist yet)."""
    return set(os.listdir(directory)) if os.path.isdir(directory) else set()


def _wait_for_download(directory, before, timeout):
    """Block until a new, fully written file appears in `directory`.

    `before` is the set of names present before the download was triggered.
    Chrome writes to a temporary name (hidden file, `.tmp` or `.crdownload`)
    and renames it on completion, so the download is done once a new entry
    exists and none of the new entries is still a temporary file.
    Returns the sorted list of new file names; raises TimeoutError otherwise.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        new_files = _list_dir(directory) - before
        in_progress = [
            name for name in new_files
            if name.startswith(".") or name.endswith((".crdownload", ".tmp"))
        ]
        if new_files and not in_progress:
            return sorted(new_files)
        time.sleep(0.5)
    raise TimeoutError(
        "No completed download appeared in %s within %s seconds" % (directory, timeout)
    )


options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--disable-extensions")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_experimental_option("prefs", {"download.default_directory": DOWNLOAD_DIR})

# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(options=options)
try:
    # implicitly_wait configures the element-lookup timeout for the whole session;
    # it is a setting, not a sleep, so it only needs to be applied once.
    driver.implicitly_wait(15)
    driver.get(URL)

    # Filing Inquiries
    driver.find_element(By.XPATH, '//*[@id="__tab_%s"]' % PANEL_ID).click()
    # Submission by Date
    driver.find_element(By.XPATH, '//*[@id="%s_ddlReportType"]/option[5]' % PANEL_ID).click()
    # CSV
    driver.find_element(By.XPATH, '//*[@id="%s_ddlExport"]/option[2]' % PANEL_ID).click()

    # Snapshot the directory so the freshly downloaded file can be told apart
    # from files that were already there.
    existing_files = _list_dir(DOWNLOAD_DIR)

    # Submit
    submit_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="%s_btnSubmitOptional"]' % PANEL_ID)
        )
    )
    submit_button.click()

    # Really wait for the file: quitting the driver right after the click would
    # terminate Chrome before the download has finished.
    _wait_for_download(DOWNLOAD_DIR, existing_files, DOWNLOAD_TIMEOUT)
finally:
    # Always shut the browser down, even when a step above fails.
    driver.quit()