Excel VBA宏:从跨多页的站点表中删除数据



提前感谢您的帮助。我运行的是Windows 8.1,我有最新的IE/Chrome浏览器和最新的Excel。我正在尝试编写一个Excel宏,从StackOverflow中提取数据(https://stackoverflow.com/tags)。具体来说,我试图提取日期(宏运行的日期)、标记名称、标记数量以及标记的简要描述。我让它在表的第一页工作,但在其余页不工作(目前有1132页)。现在,每次我运行宏时,它都会覆盖数据,我不知道如何在运行前让它查找下一个空单元格。。最后,我试着让它每周自动运行一次。

我非常感谢这里的任何帮助。问题是:

  1. 从网页表中提取第一页以外的数据
  2. 使其将数据刮到下一个空行,而不是覆盖
  3. 使宏每周自动运行一次

代码(到目前为止)如下。谢谢

Enum READYSTATE
READYSTATE_UNINITIALIZED = 0
READYSTATE_LOADING = 1
READYSTATE_LOADED = 2
READYSTATE_INTERACTIVE = 3
READYSTATE_COMPLETE = 4
End Enum
Sub ImportStackOverflowData()
  'to refer to the running copy of Internet Explorer
  Dim ie As InternetExplorer
  'to refer to the HTML document returned
  Dim html As HTMLDocument
  'open Internet Explorer in memory, and go to website
  Set ie = New InternetExplorer
  ie.Visible = False
  ie.navigate "http://stackoverflow.com/tags"
  'Wait until IE is done loading page
  Do While ie.READYSTATE <> READYSTATE_COMPLETE
    Application.StatusBar = "Trying to go to StackOverflow ..."
    DoEvents
  Loop
  'show text of HTML document returned
  Set html = ie.document
  'close down IE and reset status bar
  Set ie = Nothing
  Application.StatusBar = ""
  'clear old data out and put titles in
  'Cells.Clear
  'put heading across the top of row 3
  Range("A3").Value = "Date Pulled"
  Range("B3").Value = "Keyword"
  Range("C3").Value = "# Of Tags"
  'Range("C3").Value = "Asked This Week"
  Range("D3").Value = "Description"
  Dim TagList As IHTMLElement
  Dim Tags As IHTMLElementCollection
  Dim Tag As IHTMLElement
  Dim RowNumber As Long
  Dim TagFields As IHTMLElementCollection
  Dim TagField As IHTMLElement
  Dim Keyword As String
  Dim NumberOfTags As String
  'Dim AskedThisWeek As String
  Dim TagDescription As String
  'Dim QuestionFieldLinks As IHTMLElementCollection
  Dim TodaysDate As Date
  Set TagList = html.getElementById("tags-browser")
  Set Tags = html.getElementsByClassName("tag-cell")
  RowNumber = 4
  For Each Tag In Tags
    'if this is the tag containing the details, process it
    If Tag.className = "tag-cell" Then
      'get a list of all of the parts of this question,
      'and loop over them
      Set TagFields = Tag.all
      For Each TagField In TagFields
        'if this is the keyword, store it
        If TagField.className = "post-tag" Then
          'store the text value
          Keyword = TagField.innerText
          Cells(RowNumber, 2).Value = TagField.innerText
        End If
        If TagField.className = "item-multiplier-count" Then
          'store the integer for number of tags
          NumberOfTags = TagField.innerText
          'NumberOfTags = Replace(NumberOfTags, "x", "")
          Cells(RowNumber, 3).Value = Trim(NumberOfTags)
        End If
        If TagField.className = "excerpt" Then
          Description = TagField.innerText
          Cells(RowNumber, 4).Value = TagField.innerText
        End If
        TodaysDate = Format(Now, "MM/dd/yy")
        Cells(RowNumber, 1).Value = TodaysDate
      Next TagField
      'go on to next row of worksheet
      RowNumber = RowNumber + 1
    End If
  Next
  Set html = Nothing
  'do some final formatting
  Range("A3").CurrentRegion.WrapText = False
  Range("A3").CurrentRegion.EntireColumn.AutoFit
  Range("A1:C1").EntireColumn.HorizontalAlignment = xlCenter
  Range("A1:D1").Merge
  Range("A1").Value = "StackOverflow Tag Trends"
  Range("A1").Font.Bold = True
  Application.StatusBar = ""
  MsgBox "Done!"
End Sub

当Stack Overflow通过data Explorer等工具为您提供底层数据时,无需刮取堆栈溢出。在数据浏览器中使用此查询应该可以获得所需的结果:

select t.TagName, t.Count, p.Body
 from Tags t inner join Posts p
 on t.ExcerptPostId = p.Id
 order by t.count desc;

该查询的永久链接在这里;下载CSV";选项可能是将数据导入Excel的最简单方法。如果您想自动化这部分内容,那么到CSV下载结果的直接链接是

您可以改进它来解析出确切的元素,但它会循环所有页面并获取所有标签信息(标签旁边的所有信息)

Option Explicit
Public Sub ImportStackOverflowData()
    Dim ie As New InternetExplorer, html As HTMLDocument
    Application.ScreenUpdating = False
    With ie
        .Visible = True
        .navigate "https://stackoverflow.com/tags"
        While .Busy Or .READYSTATE < 4: DoEvents: Wend
        Set html = .document
        Dim numPages As Long, i As Long, info As Object, item As Object, counter As Long
        numPages = html.querySelector(".page-numbers.dots ~ a").innerText
        For i = 1 To 2  ' numPages ''<==1 to 2 for testing; use to numPages
            DoEvents
            Set info = html.getElementById("tags_list")
            For Each item In info.getElementsByClassName("grid-layout--cell tag-cell")
                counter = counter + 1
                Cells(counter, 1) = item.innerText
            Next item
            html.querySelector(".page-numbers.next").Click
            While .Busy Or .READYSTATE < 4: DoEvents: Wend
            Set html = .document
        Next i
        Application.ScreenUpdating = True
        .Quit '<== Remember to quit application
    End With
End Sub

我没有使用DOM,但我发现只在已知标记之间搜索很容易。如果您要查找的表达式太常见,只需稍微调整代码,使其在字符串之后查找字符串)。

一个例子:

Public Sub ZipLookUp()
Dim URL As String, xmlHTTP As Object, html As Object, htmlResponse As String
Dim SStr As String, EStr As String, EndS As Integer, StartS As Integer
Dim Zip4Digit As String
    URL = "https://tools.usps.com/go/ZipLookupResultsAction!input.action?resultMode=1&companyName=&address1=1642+Harmon+Street&address2=&city=Berkeley&state=CA&urbanCode=&postalCode=&zip=94703"
    Set xmlHTTP = CreateObject("MSXML2.XMLHTTP")
    xmlHTTP.Open "GET", URL, False
    On Error GoTo NoConnect
    xmlHTTP.send
    On Error GoTo 0
    Set html = CreateObject("htmlfile")
    htmlResponse = xmlHTTP.ResponseText
    If htmlResponse = Null Then
        MsgBox ("Aborted Run - HTML response was null")
        Application.ScreenUpdating = True
        GoTo End_Prog
    End If
    'Searching for a string within 2 strings
    SStr = "<span class=""address1 range"">" ' first string
    EStr = "</span><br />"                   ' second string
    StartS = InStr(1, htmlResponse, SStr, vbTextCompare) + Len(SStr)
    EndS = InStr(StartS, htmlResponse, EStr, vbTextCompare)
    Zip4Digit = Left(Mid(htmlResponse, StartS, EndS - StartS), 4)
    MsgBox Zip4Digit
GoTo End_Prog
NoConnect:
    If Err = -2147467259 Or Err = -2146697211 Then MsgBox "Error - No Connection": GoTo End_Prog 'MsgBox Err & ": " & Error(Err)
End_Prog:
End Sub

最新更新