注意:我的文本中有很多 div,但我只想删除下面这个特定的 div 及其所有子元素:
<div ng-if="comment.repliesCount&&showReplies" class="ng-scope">
<div>
<div>
</div>
</div>
</div>
from simplified_scrapy.simplified_doc import SimplifiedDoc
# Sample markup: an outer <div> that wraps a "test value" <div> and the
# ng-if / ng-scope <div> (with its nested children) that should be removed.
html='''
<div>
<div>test value</div>
<div ng-if="comment.repliesCount&&showReplies" class="ng-scope">
<div>
<div>
noise
</div>
</div>
</div>
</div>
'''
doc = SimplifiedDoc(html)
# The three removeElement calls below are alternative ways of targeting the
# same <div>; each one overwrites `html` with the value the call returns.
# NOTE(review): all three run in sequence here -- presumably only one is
# needed in practice; confirm how removeElement behaves once the element
# has already been removed.

# Option 1: match on the ng-if attribute. Use when that value is unique in
# the document, or when the target is its first occurrence.
html = doc.removeElement('div',attr='ng-if',value='comment.repliesCount&&showReplies')
# Option 2: match on the class attribute. Use when "ng-scope" is unique in
# the document, or when the target is its first occurrence.
html = doc.removeElement('div',attr='class',value='ng-scope')
# Option 3: if neither of the above works, also pass `start`. 'test value' is
# a string that helps locate the div to be deleted.
# NOTE(review): presumably the search begins after that string -- verify
# against the library documentation.
html = doc.removeElement('div',attr='class',value='ng-scope',start='test value')
# Show the markup that remains after the removal.
print (html)
结果:<div><div>test value</div></div>
这里有一个例子:
import re
# Sample input: the target <div> with two nested child <div>s around the text.
html='<div ng-if="comment.repliesCount&&showReplies" class="ng-scope"><div><div>HI !</div></div></div>'

# Non-greedy match from a "<" up to the nearest following ">". Compiled once
# at module level so repeated calls reuse the same pattern object.
_TAG_PATTERN = re.compile(r'<.*?>')


def removehtml(html: str) -> str:
    """Return *html* with every ``<...>`` tag stripped out.

    Only the tags themselves are removed; the text between them is kept.
    Because ``.`` does not match a newline by default, a tag that is split
    across several lines is left untouched.

    Args:
        html: Markup to clean.

    Returns:
        The input with all single-line ``<...>`` spans replaced by ``''``.
    """
    return _TAG_PATTERN.sub('', html)


print(removehtml(html))