使用VBA HTML从网页下载文件



几个月来,我一直在努力实现csv文件下载、管理和保存到指定位置的自动化过程。到目前为止,我只管理与excel vba打开网页,点击底部下载csv文件,但代码停止并需要人工干预完成,我希望它是完全自动化的,如果可能的话。查看使用的代码(我不是作者):

Sub WebDataExtraction()
Dim URL As String
Dim IeApp As Object
Dim IeDoc As Object
Dim ieForm As Object
Dim ieObj As Object
Dim objColl As Collection
URL = "http://www.bmreports.com/bsp/BMRSSystemData.php?pT=DDAD&zT=N&dT=NRT"
Set IeApp = CreateObject("InternetExplorer.Application")
IeApp.Visible = True
IeApp.Navigate URL
Do Until IeApp.ReadyState = READYSTATE_COMPLETE
Loop
Set IeDoc = IeApp.Document
For Each ele In IeApp.Document.getElementsByTagName("span")
If ele.innerHTML = "CSV" Then
Application.Wait (Now + TimeValue("0:00:15"))
DoEvents
ele.Click
'At this point you need to Save the document manually
' or figure out for yourself how to automate this interaction.
Test_Save_As_Set_Filename
File_Download_Click_Save
End If
Next
IeApp.Quit
End Sub"

thanks in advance

Nunzio

我正在发布第二个答案,因为我相信我的第一个答案对于许多类似的应用程序来说是足够的,它在这种情况下不起作用。

其他方法失败的原因:

  • .Click方法:这会引发一个新的窗口,在运行时需要用户输入,似乎不可能使用WinAPI来控制这个窗口。或者,至少不是我能确定的。代码执行停止在.Click行,直到用户手动干预,没有办法使用GoToWait或任何其他方法来规避这种行为。
  • 使用WinAPI函数直接下载源文件不起作用,因为按钮的URL不包含文件,而是一个动态服务于文件的js函数。

这是我提出的解决方案:

您可以读取网页的.body.InnerText,使用FileSystemObject将其写入纯文本/csv文件,然后结合Regular Expressions和字符串操作,将数据解析成适当分隔的csv文件。

Sub WebDataExtraction()
    Dim url As String
    Dim fName As String
    Dim lnText As String
    Dim varLine() As Variant
    Dim vLn As Variant
    Dim newText As String
    Dim leftText As String
    Dim breakTime As Date
'## Requires reference to Microsoft VBScript Regular Expressions 5.5
    Dim REMatches As MatchCollection
    Dim m As Match
'## Requires reference to Microsoft Internet Controls
    Dim IeApp As InternetExplorer
'## Requires reference to Microsoft HTML object library
    Dim IeDoc As HTMLDocument
    Dim ele As HTMLFormElement
'## Requires reference to Microsoft Scripting Runtime
    Dim fso As FileSystemObject
    Dim f As TextStream
    Dim ln As Long: ln = 1

    breakTime = DateAdd("s", 60, Now)
    url = "http://www.bmreports.com/bsp/BMRSSystemData.php?pT=DDAD&zT=N&dT=NRT"
    Set IeApp = CreateObject("InternetExplorer.Application")
    With IeApp
        .Visible = True
        .Navigate url
        Do Until .ReadyState = 4
        Loop
        Set IeDoc = .Document
    End With
    'Wait for the data to display on the page
    Do
        If Now >= breakTime Then
            If MsgBox("The website is taking longer than usual, would you like to continue waiting?", vbYesNo) = vbNo Then
                GoTo EarlyExit
            Else:
                breakTime = DateAdd("s", 60, Now)
            End If
        End If
    Loop While Trim(IeDoc.body.innerText) = "XML CSV Please Wait Data Loading Sorting"
    '## Create the text file
    fName = ActiveWorkbook.Path & "exported-csv.csv"
    Set fso = CreateObject("Scripting.FileSystemObject")
    Set f = fso.OpenTextFile(fName, 2, True, -1)
    f.Write IeDoc.body.innerText
    f.Close
    Set f = Nothing
    '## Read the text file
    Set f = fso.OpenTextFile(fName, 1, False, -1)
    Do
        lnText = f.ReadLine
        '## The data starts on the 4th line in the InnerText.
        If ln >= 4 Then
            '## Return a collection of matching date/timestamps to which we can parse
            Set REMatches = SplitLine(lnText)
            newText = lnText
            For Each m In REMatches
                newText = Replace(newText, m.Value, ("," & m.Value & ","), , -1, vbTextCompare)
            Next
            '## Get rid of consecutive delimiters:
            Do
                newText = Replace(newText, ",,", ",")
            Loop While InStr(1, newText, ",,", vbBinaryCompare) <> 0
            '## Then use some string manipulation to parse out the first 2 columns which are
            '   not a match to the RegExp we used above.
            leftText = Left(newText, InStr(1, newText, ",", vbTextCompare) - 1)
            leftText = Left(leftText, 10) & "," & Right(leftText, Len(leftText) - 10)
            newText = Right(newText, Len(newText) - InStr(1, newText, ",", vbTextCompare))
            newText = leftText & "," & newText
            '## Store these lines in an array
            ReDim Preserve varLine(ln - 4)
            varLine(ln - 4) = newText
        End If
        ln = ln + 1
    Loop While Not f.AtEndOfStream
    f.Close
'## Re-open the file for writing the delimited lines:
    Set f = fso.OpenTextFile(fName, 2, True, -1)
    '## Iterate over the array and write the data in CSV:
    For Each vLn In varLine
        'Omit blank lines, if any.
        If Len(vLn) <> 0 Then f.WriteLine vLn
    Next
    f.Close
EarlyExit:
    Set fso = Nothing
    Set f = Nothing
    IeApp.Quit
    Set IeApp = Nothing
End Sub
Function SplitLine(strLine As String) As MatchCollection
'returns a RegExp MatchCollection of Date/Timestamps found in each line
'## Requires reference to Microsoft VBScript Regular Expressions 5.5
Dim RE As RegExp
Dim matches As MatchCollection
    Set RE = CreateObject("vbscript.regexp")
    With RE
        .MultiLine = False
        .Global = True
        .IgnoreCase = True
        '## Use this RegEx pattern to parse the date & timestamps:
        .Pattern = "(19|20)dd[-](0[1-9]|1[012])[-](0[1-9]|[12][0-9]|3[01])[ ]dd?:dd:dd"
    End With
    Set matches = RE.Execute(strLine)
    Set SplitLine = matches
End Function

EDIT

我测试了我的原始答案代码使用URL:

http://www.bmreports.com/bsp/BMRSSystemData.php?pT=DDAD&zT=N&dT=NRT#saveasCSV

但这种方法似乎不工作,对于这个网站。ele.Click似乎没有启动下载,它只是打开了网页上的数据表格。要下载,您需要右键单击/另存为。如果您已经做到了这一点(我怀疑,基于您正在调用的子例程,但您没有提供代码),那么您可能可以使用Win API来获取保存对话框的HWND,并可能自动执行该事件。Santosh提供了一些相关信息:

VBA -去网站和下载文件从保存提示

这里也有一个很好的资源,应该可以帮助解决你的问题:

http://social.msdn.microsoft.com/Forums/en-US/beb6fa0e-fbc8-49df-9f2e-30f85d941fad/download-file-from-ie-with-vba

原始回答

如果能够确定CSV的URL,则可以使用此子例程打开到CSV数据的连接并将其直接导入工作簿。您可能需要在导入的数据上自动执行文本到列的操作,但是可以很容易地使用宏记录器进行复制。我在下面的Test()子程序中放了一个这样的例子。

您可以很容易地修改它,将QueryTables添加到新的工作簿中,然后在该工作簿上自动执行SaveAs方法以将文件保存为CSV。

此示例使用Yahoo Finance, Ford Motor Company的已知URL,并将在活动工作表的单元格A1中添加QueryTables和CSV数据。这可以很容易地修改,把它放在另一个工作表,另一个工作簿,等等。

Sub Test()
Dim MyURL as String
MyURL = "http://ichart.finance.yahoo.com/table.csv?s=GM&a0&b=1&c2010&d=05&e=20&f=2013&g=d&ignore=.csv"
OpenURL MyURL
'Explode the CSV data:
Range("A:A").TextToColumns Destination:=Range("A1"), DataType:=xlDelimited, _
    TextQualifier:=xlDoubleQuote, ConsecutiveDelimiter:=False, Tab:=False, _
    Semicolon:=False, Comma:=True, Space:=False, Other:=False, FieldInfo _
    :=Array(Array(1, 3), Array(2, 1), Array(3, 1), Array(4, 1), Array(5, 1), Array(6, 1), _
    Array(7, 1)), TrailingMinusNumbers:=True
End Sub
Private Sub OpenURL(fullURL As String)
'This opens the CSV in querytables connection.
On Error GoTo ErrOpenURL
    With ActiveSheet.QueryTables.Add(Connection:= _
        "URL;" & fullURL, Destination:=Range("A1"))
        .Name = fullURL
        .FieldNames = True
        .RowNumbers = False
        .FillAdjacentFormulas = False
        .PreserveFormatting = True
        .RefreshOnFileOpen = False
        .BackgroundQuery = True
        .RefreshStyle = xlInsertDeleteCells
        .SavePassword = True
        .SaveData = True
        .AdjustColumnWidth = True
        .RefreshPeriod = 0
        .WebSelectionType = xlEntirePage
        .WebFormatting = xlWebFormattingAll
        .WebPreFormattedTextToColumns = True
        .WebConsecutiveDelimitersAsOne = True
        .WebSingleBlockTextImport = False
        .WebDisableDateRecognition = False
        .WebDisableRedirections = False
        .Refresh BackgroundQuery:=False
    End With
ExitOpenURL:
Exit Sub 'if all goes well, you can exit
'Error handling...
ErrOpenURL:
Err.Clear
bCancel = True
Resume ExitOpenURL

End Sub

最新更新