比较两个HTML页面的Python内容

我想借助Python脚本比较两个HTML文件的内容。我想在简单的文本格式的差异。我怎样才能得到它呢?这些文件中也可能有表我如何得到这些表之间的差异?

你可以查看HOW TO: Compare content on 2 HTML files with Python website作为参考。

import os
from bs4 import BeautifulSoup
def getfiles(path1, path2):
#Load files on root of path1 on files1
for root, dir, names in os.walk(path1):
files1 = names
break #Will break the for to read just the root folder
#Load files on root of path2 on files2
for root, dir, names in os.walk(path2):
files2 = names
break #Will break the for to read just the root folder

#Compares the two list of files and select files with the same name on both paths
interfiles = set(files1).intersection(files2)
#Select just HTML files on mylist
mylist = [ fi for fi in interfiles if fi.endswith('.html')]

print 'nI will check:', len(mylist), 'files in total... HOLD ON!n'
return mylist

def checkcontent(path1, path2):
#Get files from both paths
mylist = getfiles(path1, path2)
difcontent = 0
diftitles = 0
titles = []
notitles = []
print '='*50
print 'Files With Different Content'
print '='*50

for files in mylist:
#Select files on path1 and add them to the sooup
htmlDoc = open (path1+files, 'r+')
soup1 = BeautifulSoup(htmlDoc.read())
#Select div class description inside div class bodytext
find1 = soup1.select('.bodytext .description')
#Select H3 tags
header1 = soup1.h3
#Select files on path2 and add them to the sopu
htmlDoc = open (path2+files, 'r+')
soup2 = BeautifulSoup(htmlDoc.read())
#Select div class description inside div class bodytext
find2 = soup2.select('.bodytext .description')
#Select H1 tag
header2 = soup2.h1
#Check if the are H1 and H3 tags
if (header2 == None or header1 == None):
notitles.append(files)
#Compares headers
else:
for headers in header1:
h1 = headers
for headers2 in header2:
h3 = headers2
if not h1 == h3:
titles.append(files)
diftitles += 1
#Read lines on HTML files
for lines1 in find1:
l = lines1
for lines2 in find2:
n = lines2
#Compares content
if not l == n:
print files
difcontent += 1
#Print results
print 'n'
print '='*50
print 'Files With No Title'
print '='*50
for lines in notitles:
print lines
print 'n'
print '='*50
print 'Files With Different Titles'
print '='*50
for lines in titles:
print lines
print "nI've found", difcontent, 'files with different content'
print "I've found", diftitles, 'different titles'
def main():
mypath = "PATH_TO_FOLDER1"
mypath2 = "PATH_TO_FOLDER2"
checkcontent(mypath, mypath2)

if __name__ == "__main__":
main()

您所需要做的就是解析html，为此，您可以使用Python的Beautiful Soup库。如果你提供了你的代码，我可以给你一个可行的代码。

由于HTML实际上是一个XML文件，因此您需要做的是比较两个XML文件。我强烈推荐xmldiff。下面是我从W3上截取的两个HTML示例。

from xmldiff import main, formatting 
diff = main.diff_files('a.html', 'b.html', formatter=formatting.XMLFormatter())

A.HTML

& lt; !DOCTYPE html>
& lt; html>
& lt; head>
& lt; title>页面Title
& lt;/head>
& lt; body>

& lt; h1>这是一个Heading
& lt; p>这是一个段落。;/p>

& lt;/body>
& lt;/html>

B.HTML

& lt; !DOCTYPE html>
& lt; html>
& lt; head>
& lt; title>不同的Title
& lt;/head>
& lt; body>

& lt; h1>这是一个Heading
& lt; p>这是一个段落。;/p>

& lt;/body>
& lt;/html>

反应:
& lt; !DOCTYPE html>

PagedifferentTitle
& lt;/head>
& lt; body>

& lt; h1>这是一个Heading
& lt; p>这是一个段落。;/p>

& lt;/body>
& lt;/html>

相关内容

最新更新

热门标签：