Excel Web Scraper-受密码保护的网站



我想访问一个名为valueline的网站,链接:

https://jump.valueline.com/login.aspx

我想登录,因为有些研究是受密码保护的。我试着把我的密码保存到Internet Explorer上,每次我使用上面的链接时,它似乎都会让我登录;但是,当我想访问另一个页面时,它会将我注销。我想从这个页面抓取数据,例如,链接:

https://research.valueline.com/research#sec=company&sym=AAPL

我似乎无法登录,但网页抓取部分工作正常。你可以创建一个试用帐户来测试它。

这是我迄今为止的代码。谢谢你的帮助/建议。

Sub Macro1()
Dim ie As Object
Set Rng = Range("A5:A5")

Set Row = Range(Rng.Offset(1, 0), Rng.Offset(1, 0).End(xlDown))
Set ie = CreateObject("InternetExplorer.Application")
With ie
.Visible = True
For Each Row In Rng
'Login Website
.navigate "https://jump.valueline.com/login.aspx?"
Application.Wait (Now + TimeValue("0:00:05"))
'Research Page
.navigate "https://research.valueline.com/research#sec=company&sym=" & Range("A" & Row.Row).Value
Do
DoEvents
Loop Until ie.readyState = READYSTATE_COMPLETE
Dim doc As HTMLDocument
Set doc = ie.document
While ie.readyState <> 4
Wend
'Application.Wait (Now + TimeValue("0:00:25"))
Dim tblName As Object
Dim span As Object
Dim price As String

On Error Resume Next
'Inserts the Name
'Last Price
Range("B" & Row.Row).Value = doc.getElementsByClassName("alignLeft")(9).innerText
'Dividend yield
Range("C" & Row.Row).Value = doc.getElementsByClassName("alignLeft")(13).innerText
Range("D" & Row.Row).Value = doc.getElementsByClassName("alignLeft")(14).innerText
Range("E" & Row.Row).Value = doc.getElementsByClassName("rank-text")(0).innerText
Range("F" & Row.Row).Value = doc.getElementsByClassName("rank-text")(1).innerText
Range("G" & Row.Row).Value = doc.getElementsByClassName("rank-text")(2).innerText

Next Row

End With
ie.Quit

结束子

尝试此代码,但如果网站发生更改(例如,在第9个前面添加alignLeft class元素(,代码将无法按预期工作。如果网站更改了代码,则应添加其他检查。这就是为什么我建议您使用API,它独立于网站代码。

这段代码仍然有一些丑陋之处(例如,使用On Error Resume Next完成的文档检查(,但已经改进(阅读注释(,应该可以工作。

尚未正常工作!在制品

Sub Macro1()
Dim ie As Object
Dim rng As Excel.Range  'Always declare all vars
Dim row As Excel.Range
Dim wb As Excel.Workbook 'Don't rely on implicit ActiveWorkbook or Sheet, declare!
Dim sh As Excel.Worksheet
Set wb = ThisWorkbook
Set sh = wb.Worksheets(1)
Set rng = sh.Range("A5:A5")

Set row = sh.Range(rng.Offset(1, 0), rng.Offset(1, 0).End(xlDown))
Set ie = CreateObject("InternetExplorer.Application")
With ie
.Visible = True
'Login Website
.navigate "https://jump.valueline.com/login.aspx?"
'Application.Wait (Now + TimeValue("0:00:05")) 'Don't use Application.Wait
Do
DoEvents
Loop Until Not ie.Busy And ie.readyState = 4 'Check for busy too, or better use InternetExploreres Withevent DocumentComplete and check for sth. like pDisp.object = ie.object, I will evaluate this soon.
Dim doc As HTMLDocument
Set doc = .Document
If ie.LocationURL <> "https://jump.valueline.com/Loggedon.aspx" Then 'if loggen in ie gets redirected to loggedon.aspx
doc.getelementbyid("ctl00_ContentPlaceHolder_LoginControl_txtUserID").Value = "valuelinetester@gmail.com" 'fill login form an submit
doc.getelementbyid("ctl00_ContentPlaceHolder_LoginControl_txtUserPw").Value = "Valueline1"
doc.getelementbyid("ctl00_ContentPlaceHolder_LoginControl_btnLogin").Click ''click submit, because the forms code uses some strange javascript I don't know what it does. Usuallay you refer to the form direct and submit it or use a Get/Post request.
Else
'Already logged in
End If
Dim FirstSearchDone As Boolean
FirstSearchDone = False
For Each row In rng
'Research Page
If FirstSearchDone Then
.navigate "https://research.valueline.com/secure/research#sec=company&sym=" & row.Value
Else
If MsgBox("First search has to be done manually. Please type " & row.Value & _
" in searchbox and click on result. After Site is loaded click OK.", vbOKCancel) = vbOK Then 'First search has to be done manually
FirstSearch = True
Else
Exit Sub
End If
End If

Do
DoEvents
Loop Until Not ie.Busy And ie.readyState = 4
On Error Resume Next ' If IE not ready error occurs and loop starts again
Do
Err.Clear 'clear error to detect ie.document set
'Inserts the Name
'Last Price
With sh 'uses explicit sheet instead of former implicit activesheet
.Range("B" & row.row).Value = doc.getElementsByClassName("alignLeft")(9).innerText
'Dividend yield
.Range("C" & row.row).Value = doc.getElementsByClassName("alignLeft")(13).innerText
.Range("D" & row.row).Value = doc.getElementsByClassName("alignLeft")(14).innerText
.Range("E" & row.row).Value = doc.getElementsByClassName("rank-text")(0).innerText
.Range("F" & row.row).Value = doc.getElementsByClassName("rank-text")(1).innerText
.Range("G" & row.row).Value = doc.getElementsByClassName("rank-text")(2).innerText
End With
Loop While Err.Number = 91 'error if ie.document not set

On Error GoTo 0
Err.Clear
Next row
.Quit
End With
Set ie = Nothing
End Sub

最新更新