我有以下代码,用于从网站上抓取 HTML 片段。
它基本可以工作:在 Parse.txt 文件中,我可以看到 innerHTML,这正是我想要的 HTML。
但是,HTML 文件中的内容更多:它包含所有的页眉和页脚,而这些内容并没有出现在文本文件的 innerHTML 对象中。
我想做的只是把这个对象(innerHTML)单独保存到 HTML 文件中。
# Open the job page in Internet Explorer and write two files:
#   Parse.txt  - the <div> element(s) whose class is 'job-template__wrapper'
#   Parse.html - the innerHTML of the whole <body>
# NOTE(review): the path separators were missing from the original literals;
# they are restored here as the most likely intended Desktop paths - confirm.
$ie = New-Object -ComObject InternetExplorer.Application
try {
    $ie.Silent  = $false
    $ie.Visible = $true
    $ie.Navigate2("www.website.com/job1")

    # Busy alone can still be $false right after Navigate2 returns, so also
    # wait until ReadyState reaches 4 (READYSTATE_COMPLETE).
    while ($ie.Busy -or $ie.ReadyState -ne 4) { Start-Sleep -Milliseconds 250 }

    # grab the table html
    $ie.Document.IHTMLDocument3_getElementsByTagName("div") |
        Where-Object { $_.className -eq 'job-template__wrapper' } |
        Out-File "C:\Users\user\Desktop\Parse.txt"
    $ie.Document.body.innerHTML | Out-File "C:\Users\user\Desktop\Parse.html"
}
finally {
    # Always close the browser, even when navigation or file output fails.
    $ie.Quit()
}
我设法解决了这个问题。这可能不是最好的方法,但代码如下:
# Scrape job ads: walk the paginated listing, open every job link in Internet
# Explorer, cut the job description out of the rendered page, save it as a
# standalone HTML file and convert that file to a Word document with
# htmlToWord (defined outside this section).
#
# NOTE(review): the path separators were missing from every literal path in the
# original, and some paths pointed at a different user profile. The values
# below are the most likely intended locations - confirm them (in particular
# the "word" sub-folder) before running.
$maxJobs   = 2000
$siteRoot  = 'http://www.website.com.au'
$scrapeDir = 'C:\Users\user\Desktop\web_scrape'
$wordDir   = Join-Path $scrapeDir 'word'
$linksFile = 'C:\Users\user\Desktop\links.txt'

# Counters
$i = 1      # number of the next job file to write
$page = 1   # listing page currently being processed

# Block until the browser has finished loading the current page.
function Wait-IEReady {
    param($Browser)
    # Busy alone can still be $false right after Navigate2 returns, so also
    # wait until ReadyState reaches 4 (READYSTATE_COMPLETE).
    while ($Browser.Busy -or $Browser.ReadyState -ne 4) { Start-Sleep -Milliseconds 250 }
}

# One browser instance is reused for all pages instead of starting a new
# process per page; the finally block guarantees it is closed on any exit path.
$ie = New-Object -ComObject InternetExplorer.Application
try {
    $ie.Visible = $true   # set to $false for a hidden run
    $ie.Silent  = $false  # set to $true to suppress IE dialog boxes

    # Main loop: goes until $maxJobs job ads have been saved.
    # This isn't 100% accurate: the check only happens between listing pages,
    # so you may end up with more than $maxJobs but never fewer (unless the
    # listing runs out of jobs first).
    while ($i -le $maxJobs) {
        $listingUrl = "https://www.website.com.au/page?page=$page"
        $ie.Navigate2($listingUrl)
        Wait-IEReady $ie

        # Collect the job links of THIS listing page. The original always
        # requested the un-paged URL, so every iteration saw the same links.
        $site  = Invoke-WebRequest -Uri $listingUrl
        $hrefs = $site.Links.Href | Sort-Object | Get-Unique
        $hrefs | Out-File $linksFile   # kept on disk for inspection
        $links = @($hrefs | Where-Object { $_ -like '*/job/*' })

        # Without this guard an empty page would never advance $i and the
        # loop would request listing pages forever.
        if ($links.Count -eq 0) {
            Write-Warning "No job links found on listing page $page - stopping."
            break
        }

        # loop through each job link
        foreach ($link in $links) {
            # Connect to job site
            $ie.Navigate2($siteRoot + $link)
            Wait-IEReady $ie

            # Rendered markup of the whole page body (empty string if the
            # document is unavailable).
            $content = [string]$ie.Document.body.innerHTML

            # Remove header / footer: keep what lies between the first
            # '</style>' and the next '</span>'. Skip the page when either
            # marker is missing instead of calling Substring with a negative
            # length.
            $styleEnd = $content.IndexOf('</style>')
            if ($styleEnd -lt 0) {
                Write-Warning "Skipping $link - '</style>' marker not found."
                continue
            }
            $start = $styleEnd + '</style>'.Length
            $end   = $content.IndexOf('</span>', $start)
            if ($end -lt 0) {
                Write-Warning "Skipping $link - '</span>' marker not found."
                continue
            }
            $fragment = $content.Substring($start, $end - $start)

            # Set file variables
            $htmlFile = Join-Path $scrapeDir "scrape$i.html"
            $docFile  = Join-Path $wordDir   "scrape$i.docx"

            # Add html tags for word conversion. The document is assembled in
            # memory and written once, so the DOCTYPE precedes <HTML> and all
            # parts end up in the same file.
            Set-Content -Path $htmlFile -Value @(
                '<!DOCTYPE html PUBLIC >'
                '<HTML>'
                $fragment
                '</HTML>'
            )

            # Convert html to word
            htmlToWord $htmlFile $docFile
            $i += 1
        }
        $page += 1
    }
}
finally {
    $ie.Quit()
}