在出色的成员@QHarr的帮助下,我有了以下代码,可以从html本地文件中抓取数据,这是一个非常好的
Sub Test()
Dim html As HTMLDocument, tables As Object, ws As Worksheet, fStream As ADODB.Stream
Dim headers(), mappings(), arr(13), newarr(13), cnt As Long, i As Long, j As Long, n As Long
Dim xFd As FileDialog, sFile As Variant, sSchool As String, sFolder As String, x As Long
Set ws = ThisWorkbook.Worksheets("Results")
Set html = New HTMLDocument
Set fStream = New ADODB.Stream
Set xFd = Application.FileDialog(msoFileDialogFolderPicker)
xFd.Title = "Please Select The Original Folder:"
If xFd.Show = -1 Then sFolder = xFd.SelectedItems(1) & "" Else Exit Sub
sSchool = Split(sFolder, "")(UBound(Split(sFolder, "")) - 1)
sFile = Dir(sFolder)
cnt = ws.Cells(Rows.Count, 1).End(xlUp).Row: x = cnt
headers = Array("م", "كود الطالب", "الرقم القومي", "اسم الطالب", "الجنسية", "الديانة", "تاريخ الميلاد", "يوم", "شهر", "سنة", "محافظة الميلاد", "حالة القيد", "النوع", "ملاحظات")
mappings = Array(3, 8, 9, 12, 11, 10, 2, 7, 1, 6, 5, 4, 13)
If IsEmpty(ws.Cells(1, 1).Value) Then ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
Application.ScreenUpdating = False
While sFile <> ""
With fStream
.Charset = "UTF-8"
.Open
.LoadFromFile sFolder & sFile
html.body.innerHTML = .ReadText
.Close
End With
Set tables = html.querySelectorAll("table[width='100%'] table:first-child")
For i = 89 To tables.Length - 17 Step 26
Erase arr
arr(0) = vbNullString
For j = 0 To 12
arr(mappings(j)) = Application.Trim(tables.Item(i + (2 * (j))).innerText)
If j = 4 And arr(3) = "غير مصرى" Then arr(mappings(j)) = 0
Next j
For j = UBound(arr) To LBound(arr) Step -1
newarr(n) = arr(j)
If n = 6 Then
If IsDate(newarr) Then newarr(n) = CDate(Day(newarr(n)) & "/" & Month(newarr(n)) & "/" & Year(newarr(n)))
End If
n = n + 1
Next j
ws.Cells(cnt + 1, 1).Resize(1, UBound(arr) + 1) = newarr
cnt = cnt + 1: n = 0
Next i
sFile = Dir
Wend
ws.Cells(x + 1, 14).Resize(cnt - x).Value = sSchool
ws.Activate
Application.ScreenUpdating = True
End Sub
当没有国籍id(html表中的第三列(时,结果的唯一问题是الرقمااي当它为空时,我没有得到正确的名称和以下名称的结果如果您运行代码,请注意从11到17的行。。。附件是在这个链接上有文件的FolderToTest
我试着解决了这个问题,结果有所调整(但仍然不正确,因为国籍id为空的名字不见了,下面的名字有他的一些数据(这是我最后一次尝试
Sub Test()
Dim html As HTMLDocument, tables As Object, ws As Worksheet, fStream As ADODB.Stream
Dim headers(), mappings(), arr(13), newarr(13), cnt As Long, i As Long, j As Long, n As Long
Dim xFd As FileDialog, sFile As Variant, sSchool As String, sFolder As String, x As Long
Set ws = ThisWorkbook.Worksheets("Results")
Set html = New HTMLDocument
Set fStream = New ADODB.Stream
Set xFd = Application.FileDialog(msoFileDialogFolderPicker)
xFd.Title = "Please Select The Original Folder:"
If xFd.Show = -1 Then sFolder = xFd.SelectedItems(1) & "" Else Exit Sub
sSchool = Split(sFolder, "")(UBound(Split(sFolder, "")) - 1)
sFile = Dir(sFolder)
cnt = ws.Cells(Rows.Count, 1).End(xlUp).Row: x = cnt
headers = Array("م", "كود الطالب", "الرقم القومي", "اسم الطالب", "الجنسية", "الديانة", "تاريخ الميلاد", "يوم", "شهر", "سنة", "محافظة الميلاد", "حالة القيد", "النوع", "ملاحظات")
mappings = Array(3, 8, 9, 12, 11, 10, 2, 7, 1, 6, 5, 4, 13)
If IsEmpty(ws.Cells(1, 1).Value) Then ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
Application.ScreenUpdating = False
While sFile <> ""
With fStream
.Charset = "UTF-8"
.Open
.LoadFromFile sFolder & sFile
html.body.innerHTML = .ReadText
.Close
End With
Set tables = html.querySelectorAll("table[width='100%'] table:first-child")
For i = 89 To tables.Length - 17 Step 26
Erase arr
arr(0) = vbNullString
For j = 0 To 12
arr(mappings(j)) = Application.Trim(tables.Item(i + (2 * (j))).innerText)
'If j = 4 And arr(3) = "غير مصرى" Then arr(mappings(j)) = 0
If j = 3 And Not IsNumeric(Application.Trim(tables.Item(i + (2 * (j)) + 2).innerText)) Then
i = i + 24
End If
Next j
For j = UBound(arr) To LBound(arr) Step -1
newarr(n) = arr(j)
If n = 6 Then
newarr(n) = CDate(Day(newarr(n)) & "/" & Month(newarr(n)) & "/" & Year(newarr(n)))
End If
n = n + 1
Next j
ws.Cells(cnt + 1, 1).Resize(1, UBound(arr) + 1) = newarr
cnt = cnt + 1: n = 0
Next i
sFile = Dir
Wend
ws.Cells(x + 1, 14).Resize(cnt - x).Value = sSchool
ws.Activate
Application.ScreenUpdating = True
End Sub
您可以通过以下方式对此进行细化。我使用Select Case
在محافظة الميلاد
的值的基础上测试الرقــم القومــي
中的缺失值。如果محافظة الميلاد
是غير مصرى
,那么我假设稍后会有一个空值,并相应地调整我用来填充数组的c
计数器。我将更新以删除一些不必要的硬编码。
请注意,这里的映射也与您以前的文件不同。
Option Explicit
Public Sub ParseInfo()
Dim html As HTMLDocument, tables As Object, ws As Worksheet, i As Long
Set ws = ThisWorkbook.Worksheets("Results")
Dim fStream As ADODB.Stream
Set html = New HTMLDocument
Set fStream = New ADODB.Stream
With fStream
.Charset = "UTF-8"
.Open
.LoadFromFile "C:UsersUserDesktoptest.html"
html.body.innerHTML = .ReadText
.Close
End With
Dim r As Long, c As Long, currentItem As Variant, missingValueFlag As Boolean
Set tables = html.querySelectorAll("table")
Dim mappings(), arr()
ReDim arr(12)
mappings = Array(2, 7, 8, 11, 10, 9, 1, 6, 0, 5, 4, 3, 12)
r = 1: c = 1
For i = 91 To 504 Step 2
currentItem = tables.item(i).innerText
Select Case c
Case 1
If currentItem = "غير مصرى" Then
missingValueFlag = True
End If
Case 5
If missingValueFlag Then c = c + 1
End Select
arr(mappings(c - 1)) = currentItem
If c = 13 Then
ws.Cells(r, 1).Resize(1, UBound(arr) + 1) = arr
c = 1: r = r + 1
missingValueFlag = False
ReDim arr(12)
Else
c = c + 1
End If
Next
End Sub