从本地html文件导入数据后调整结果



在出色的成员@QHarr的帮助下,我有了以下代码,可以从html本地文件中抓取数据,这是一个非常好的

Sub Test()
Dim html As HTMLDocument, tables As Object, ws As Worksheet, fStream As ADODB.Stream
Dim headers(), mappings(), arr(13), newarr(13), cnt As Long, i As Long, j As Long, n As Long
Dim xFd As FileDialog, sFile As Variant, sSchool As String, sFolder As String, x As Long
Set ws = ThisWorkbook.Worksheets("Results")
Set html = New HTMLDocument
Set fStream = New ADODB.Stream
Set xFd = Application.FileDialog(msoFileDialogFolderPicker)
xFd.Title = "Please Select The Original Folder:"
If xFd.Show = -1 Then sFolder = xFd.SelectedItems(1) & "" Else Exit Sub
sSchool = Split(sFolder, "")(UBound(Split(sFolder, "")) - 1)
sFile = Dir(sFolder)
cnt = ws.Cells(Rows.Count, 1).End(xlUp).Row: x = cnt
headers = Array("م", "كود الطالب", "الرقم القومي", "اسم الطالب", "الجنسية", "الديانة", "تاريخ الميلاد", "يوم", "شهر", "سنة", "محافظة الميلاد", "حالة القيد", "النوع", "ملاحظات")
mappings = Array(3, 8, 9, 12, 11, 10, 2, 7, 1, 6, 5, 4, 13)
If IsEmpty(ws.Cells(1, 1).Value) Then ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
Application.ScreenUpdating = False
While sFile <> ""
With fStream
.Charset = "UTF-8"
.Open
.LoadFromFile sFolder & sFile
html.body.innerHTML = .ReadText
.Close
End With
Set tables = html.querySelectorAll("table[width='100%'] table:first-child")
For i = 89 To tables.Length - 17 Step 26
Erase arr
arr(0) = vbNullString
For j = 0 To 12
arr(mappings(j)) = Application.Trim(tables.Item(i + (2 * (j))).innerText)
If j = 4 And arr(3) = "غير مصرى‏" Then arr(mappings(j)) = 0
Next j
For j = UBound(arr) To LBound(arr) Step -1
newarr(n) = arr(j)
If n = 6 Then
If IsDate(newarr) Then newarr(n) = CDate(Day(newarr(n)) & "/" & Month(newarr(n)) & "/" & Year(newarr(n)))
End If
n = n + 1
Next j
ws.Cells(cnt + 1, 1).Resize(1, UBound(arr) + 1) = newarr
cnt = cnt + 1: n = 0
Next i
sFile = Dir
Wend
ws.Cells(x + 1, 14).Resize(cnt - x).Value = sSchool
ws.Activate
Application.ScreenUpdating = True
End Sub

当没有国籍id(html表中的第三列(时,结果的唯一问题是الرقمااي当它为空时,我没有得到正确的名称和以下名称的结果如果您运行代码,请注意从11到17的行。。。附件是在这个链接上有文件的FolderToTest

我试着解决了这个问题,结果有所调整(但仍然不正确,因为国籍id为空的名字不见了,下面的名字有他的一些数据(这是我最后一次尝试

Sub Test()
Dim html As HTMLDocument, tables As Object, ws As Worksheet, fStream As ADODB.Stream
Dim headers(), mappings(), arr(13), newarr(13), cnt As Long, i As Long, j As Long, n As Long
Dim xFd As FileDialog, sFile As Variant, sSchool As String, sFolder As String, x As Long
Set ws = ThisWorkbook.Worksheets("Results")
Set html = New HTMLDocument
Set fStream = New ADODB.Stream
Set xFd = Application.FileDialog(msoFileDialogFolderPicker)
xFd.Title = "Please Select The Original Folder:"
If xFd.Show = -1 Then sFolder = xFd.SelectedItems(1) & "" Else Exit Sub
sSchool = Split(sFolder, "")(UBound(Split(sFolder, "")) - 1)
sFile = Dir(sFolder)
cnt = ws.Cells(Rows.Count, 1).End(xlUp).Row: x = cnt
headers = Array("م", "كود الطالب", "الرقم القومي", "اسم الطالب", "الجنسية", "الديانة", "تاريخ الميلاد", "يوم", "شهر", "سنة", "محافظة الميلاد", "حالة القيد", "النوع", "ملاحظات")
mappings = Array(3, 8, 9, 12, 11, 10, 2, 7, 1, 6, 5, 4, 13)
If IsEmpty(ws.Cells(1, 1).Value) Then ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
Application.ScreenUpdating = False
While sFile <> ""
With fStream
.Charset = "UTF-8"
.Open
.LoadFromFile sFolder & sFile
html.body.innerHTML = .ReadText
.Close
End With
Set tables = html.querySelectorAll("table[width='100%'] table:first-child")
For i = 89 To tables.Length - 17 Step 26
Erase arr
arr(0) = vbNullString
For j = 0 To 12
arr(mappings(j)) = Application.Trim(tables.Item(i + (2 * (j))).innerText)
'If j = 4 And arr(3) = "غير مصرى‏" Then arr(mappings(j)) = 0
If j = 3 And Not IsNumeric(Application.Trim(tables.Item(i + (2 * (j)) + 2).innerText)) Then
i = i + 24
End If
Next j
For j = UBound(arr) To LBound(arr) Step -1
newarr(n) = arr(j)
If n = 6 Then
newarr(n) = CDate(Day(newarr(n)) & "/" & Month(newarr(n)) & "/" & Year(newarr(n)))
End If
n = n + 1
Next j
ws.Cells(cnt + 1, 1).Resize(1, UBound(arr) + 1) = newarr
cnt = cnt + 1: n = 0
Next i
sFile = Dir
Wend
ws.Cells(x + 1, 14).Resize(cnt - x).Value = sSchool
ws.Activate
Application.ScreenUpdating = True
End Sub

您可以通过以下方式对此进行细化。我使用Select Caseمحافظة الميلاد‎的值的基础上测试الرقــم القومــي‎中的缺失值。如果محافظة الميلاد‎غير مصرى‏,那么我假设稍后会有一个空值,并相应地调整我用来填充数组的c计数器。我将更新以删除一些不必要的硬编码。

请注意,这里的映射也与您以前的文件不同。

Option Explicit
Public Sub ParseInfo()
Dim html As HTMLDocument, tables As Object, ws As Worksheet, i As Long
Set ws = ThisWorkbook.Worksheets("Results")
Dim fStream  As ADODB.Stream
Set html = New HTMLDocument
Set fStream = New ADODB.Stream
With fStream
.Charset = "UTF-8"
.Open
.LoadFromFile "C:UsersUserDesktoptest.html"
html.body.innerHTML = .ReadText
.Close
End With
Dim r As Long, c As Long, currentItem As Variant, missingValueFlag As Boolean
Set tables = html.querySelectorAll("table")
Dim mappings(), arr()
ReDim arr(12)
mappings = Array(2, 7, 8, 11, 10, 9, 1, 6, 0, 5, 4, 3, 12)
r = 1: c = 1
For i = 91 To 504 Step 2
currentItem = tables.item(i).innerText
Select Case c
Case 1
If currentItem = "غير مصرى‏" Then  
missingValueFlag = True
End If
Case 5
If missingValueFlag Then c = c + 1
End Select
arr(mappings(c - 1)) = currentItem
If c = 13 Then
ws.Cells(r, 1).Resize(1, UBound(arr) + 1) = arr
c = 1: r = r + 1
missingValueFlag = False
ReDim arr(12)
Else
c = c + 1
End If
Next
End Sub

最新更新