Python从xml中删除标记br和其他标记



我正在把此链接 https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms 的 XML 按标题拆分成多个 XML 文件:

#Python code to illustrate parsing of XML files
# importing the required modules
import os
import re
import xml.etree.ElementTree as ET

import requests
def loadRSS(url="https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms",
            filename='topnewsfeed.xml', timeout=30):
    """Download the RSS feed and save it to a local XML file.

    Args:
        url: Address of the feed to download.
        filename: Path of the file the raw response body is written to.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving an error page as the feed.
    resp.raise_for_status()

    # resp.content is the undecoded body, so the file is a byte-for-byte copy.
    with open(filename, 'wb') as f:
        f.write(resp.content)

def wire_xml(filename):
    """Split a feed into one XML file per <article>, named after its headline.

    Each article is written to ``./xml/<headline>.xml``; the directory is
    created if it does not exist. Characters that are not allowed in file
    names are replaced by ``_``, and articles without a usable headline are
    skipped.

    Args:
        filename: Path (or file object) of the XML feed to split.
    """
    out_dir = './xml'
    os.makedirs(out_dir, exist_ok=True)

    # Only 'end' events are requested: by then the element is fully built.
    for _event, elem in ET.iterparse(filename, events=('end',)):
        if elem.tag != 'article':
            continue

        title = elem.findtext('headline') or ''
        # Headlines can contain '/', ':', '?', quotes, ... which are path
        # separators or illegal in file names on common file systems.
        safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', '_', title).strip(' .')
        if safe_title:
            with open(os.path.join(out_dir, safe_title + '.xml'), 'wb') as f:
                f.write(ET.tostring(elem))

        # Release the processed article so large feeds are not kept in memory.
        elem.clear()

def main():
    """Refresh the local feed copy, then split it into per-article files."""
    feed_file = 'topnewsfeed.xml'

    # Fetch the latest feed from the web into the local XML file.
    loadRSS()

    # Write every news item of the downloaded feed to its own XML file.
    wire_xml(feed_file)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

上面的代码可以运行,但存在两个问题:

1. XML 中的内容(文本)含有无用的标签,如何删除这些标签?示例:

<content><div class="section1"><div class="Normal">HYDERABAD: Bharat Biotech on Friday said it has committed to supply over 500 million doses of its Covid-19 vaccine Covaxin to the Centre under the countrywide immunisation programme.<br/><br/>Speaking at a virtual conference organised by the Confederation of Indian Industry, Suchitra Ella, joint Managing Director of the city-based vaccine maker, said the company's facilities in four cities - Hyderabad, Bengaluru, Pune, and Ankaleshwar - are currently producing Covaxin.<br/><br/>"

2. 如何按照我的要求更改标签?示例:

<?xml version="1.0" encoding="UTF-8"?>
-<nitf>

-<head>
<title>Ukraine Black Sea ports resume grain operations</title>

-<iim ver="3">
<ds value="" num="1:20"/>
<ds value="Reuter" num="1:30"/>
<ds value="" num="1:40"/>
<ds value="REU" num="1:50"/>
<ds value="20210723" num="1:70"/>
<ds value="055600+0000" num="1:80"/>
<ds value="Reuter.2021-07-23T055600Z_528892025_L1N2OZ07W_RTRMADT_0_GRAINS-UKRAINE-PORTS.XML" num="2:05"/>
<ds value="" num="2:07"/>
<ds value="3" num="2:10"/>
<ds value="OEC" num="2:15"/>
<ds value="" num="2:20"/>
<ds value="" num="2:22"/>
<ds value="GRAINS-UKRAINE/PORTS" num="2:25"/>
<ds value="" num="2:50"/>
<ds value="20210723" num="2:55"/>
<ds value="" num="2:80"/>
<ds value="" num="2:85"/>
<ds value="" num="2:90"/>
<ds value="" num="2:95"/>
<ds value="" num="2:101"/>
<ds value="Ukraine Black Sea ports resume grain operations" num="2:105"/>
<ds value="Reuter" num="2:110"/>
<ds value="Reuter" num="2:115"/>
<ds value="KYIV, July 23 (Reuters) - All Ukraine major Black Sea ports are working in normal mode, resuming operations affected by poor weather on Thursday, the state seaport authority said on Friday." num="2:120"/>
</iim>
</head>

-<body>

-<body.content>
<p>KYIV, July 23 (Reuters) - All Ukraine major Black Sea ports are working in normal mode, resuming operations affected by poor weather on Thursday, the state seaport authority said on Friday.</p>
<p>The restrictions of grain-loading operations had applied to the ports of Odesa, Chornomorsk, Mykolayiv, and Pivdeny.</p>
<p>Ukraine is among the world's biggest global grain exporters and plans to ship about 56 million tonnes of grain in the 2021/22 season. (Reporting by Pavel Polityuk)</p>
</body.content>
</body>
</nitf>

我想按上面的格式保存所有 XML 文件。

# Python code to illustrate parsing of XML files
# importing the required modules
import re as re
import xml.etree.ElementTree as ET
import pandas as pd
from urllib.request import Request, urlopen
import configparser
import os
def loadRSS(url="https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms",
            timeout=30):
    """Fetch the RSS feed and return it as an in-memory binary stream.

    Args:
        url: Address of the feed to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A file-like object positioned at the start of the downloaded XML
        (suitable for ``ET.parse``), or ``None`` if the download failed; in
        that case the error is printed and the user is prompted first.
    """
    import io  # local import: only needed to wrap the downloaded bytes

    # Sends a browser-like User-Agent (presumably the site rejects urllib's
    # default one -- TODO confirm).
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        # Read the whole body inside `with` so the connection is closed
        # instead of handing an open HTTP response to the caller.
        with urlopen(req, timeout=timeout) as web_byte:
            return io.BytesIO(web_byte.read())
    except OSError as e:
        print("Error in connecting TIL site :- ",e)
        # Keeps the console window open until the user has seen the error.
        input("Press andy to Close")
        return None
def parseXML(xmlfile):
    """Parse the feed and return its articles as a list of dicts.

    Args:
        xmlfile: Path or file-like object accepted by ``ET.parse``.

    Returns:
        One dict per direct child of the root element, with the key ``ID``
        (the child's ``ID`` attribute) and the keys ``headline``,
        ``imagename``, ``content``, ``summary``, ``caption`` and ``cats``
        (text of the sub-element of that name). A missing attribute or
        sub-element yields ``None`` for that key.
    """
    fields = ("headline", "imagename", "content", "summary", "caption", "cats")
    root = ET.parse(xmlfile).getroot()

    news = []
    for node in root:
        item = {"ID": node.attrib.get("ID")}
        for tag in fields:
            child = node.find(tag)
            # find() returns None for an absent tag; guard before reading
            # .text so one incomplete article does not abort the whole run.
            item[tag] = child.text if child is not None else None
        news.append(item)
    return news
def savetodf(newsitems):
    """Load the parsed news items into a DataFrame and strip tags from content.

    Args:
        newsitems: List of dicts keyed by the column names listed below.

    Returns:
        A DataFrame with the columns ``ID, headline, imagename, content,
        caption, summary, cats``. Everything of the form ``<...>`` is removed
        from string values of ``content``; non-string values (e.g. missing
        content) are left unchanged.
    """
    df_cols = ['ID', 'headline', 'imagename', 'content', 'caption', 'summary', 'cats']
    out_df = pd.DataFrame(newsitems, columns=df_cols)

    # [^>]* (unlike .*?) also matches tags whose attributes span several lines.
    tag_pattern = re.compile(r'<[^>]*>')

    def strip_tags(value):
        # Missing content shows up as None/NaN; re.sub would raise on it.
        return tag_pattern.sub('', value) if isinstance(value, str) else value

    out_df['content'] = out_df['content'].apply(strip_tags)
    return out_df
def define_filename(filename):
    """Build the output path ``<default_path>/<filename>.xml`` for one news item.

    The target directory is the ``default_path`` option read from ``path.ini``
    in the current working directory (if several sections define it, the last
    one wins). When the file or the option is missing, the current directory
    is used instead of failing on an unassigned variable.

    Args:
        filename: Base name of the file without extension; non-string values
            are converted with ``str()``.

    Returns:
        The joined path as a string.
    """
    config = configparser.ConfigParser()
    config.read('path.ini')  # read() silently ignores a missing file

    default_path = "."
    for section_name in config.sections():
        if config.has_option(section_name, 'default_path'):
            default_path = config.get(section_name, 'default_path')

    file_formate = "xml"
    return os.path.join(default_path, str(filename) + "." + file_formate)
def build_item_xml(row):
    """Write one news item as an NITF XML file in the CCI import structure.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing ``ID``,
            ``headline``, ``cats`` and ``content``.

    Returns:
        The unchanged ``row``, so the function can be passed to
        ``DataFrame.apply``.
    """
    def text_of(key):
        # ElementTree cannot serialise None/NaN; write missing values as "".
        value = row[key]
        if isinstance(value, str):
            return value
        return "" if pd.isna(value) else str(value)

    headline = text_of("headline")

    nitf = ET.Element('nitf')

    # <head>: title, category and the IIM datasets.
    head = ET.SubElement(nitf, 'head')
    ET.SubElement(head, 'title').text = headline
    ET.SubElement(head, 'cats').text = text_of("cats")

    iim = ET.SubElement(head, 'iim', ver='3')
    # `num` is passed as a real attribute of <ds>; embedding it in the tag
    # name only happened to serialise to the same text.
    ET.SubElement(iim, 'ds', num="1:20", value="79")
    # Dataset 1:30 is important for the import into CCI.
    ET.SubElement(iim, 'ds', num="1:30", value="TOIOnline")
    # NOTE: datasets 1:80 ("113052+0000") and 2:10 ("3") were disabled in the
    # original script and are intentionally not written.
    ET.SubElement(iim, 'ds', num="2:20", value="TOIOnline")
    ET.SubElement(iim, 'ds', num="2:25", value=headline)
    ET.SubElement(iim, 'ds', num="2:105", value=headline)

    # <body>: the cleaned article text.
    body = ET.SubElement(nitf, 'body')
    ET.SubElement(body, 'body.content').text = text_of("content")

    ET.ElementTree(nitf).write(
        define_filename(row['ID']), encoding='utf-8', xml_declaration=True
    )
    return row
def main():
    """Download the feed, clean it, and write one NITF file per article."""
    lodrss = loadRSS()
    # loadRSS returns None after reporting a connection error; parsing None
    # would only add a confusing traceback on top of that message.
    if lodrss is None:
        return

    # Parse the feed into a list of per-article dicts.
    newsitems = parseXML(lodrss)
    # Collect the items in a DataFrame (this also strips tags from content).
    df = savetodf(newsitems)
    # build_item_xml is called once per row and writes that row's file.
    df.apply(build_item_xml, axis=1)


if __name__ == "__main__":
    main()

1. XML 中的内容(文本)含有无用的标签,如何删除这些标签?示例:

更好的做法是先把输入的 RSS 提要存入 DataFrame(数据框),然后就可以去掉其中的标签:

# Remove everything of the form <...> (markup tags) from each value of the
# 'content' column; out_df is the DataFrame built from the parsed feed.
out_df['content']=out_df['content'].apply(lambda cw :re.sub('<.*?>','',cw))

2. 如何按照我的要求更改标签?示例:

您需要像下面这样,根据数据框中的每一行创建一个新的 XML,并分别保存为以各自 ID 命名的 XML 文件:

def build_item_xml(row):
    """Write one news item as an NITF XML file in the CCI import structure.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing ``ID``,
            ``headline``, ``cats`` and ``content``.

    Returns:
        The unchanged ``row``, so the function can be passed to
        ``DataFrame.apply``.
    """
    def text_of(key):
        # ElementTree cannot serialise None/NaN; write missing values as "".
        value = row[key]
        if isinstance(value, str):
            return value
        return "" if pd.isna(value) else str(value)

    headline = text_of("headline")

    nitf = ET.Element('nitf')

    # <head>: title, category and the IIM datasets.
    head = ET.SubElement(nitf, 'head')
    ET.SubElement(head, 'title').text = headline
    ET.SubElement(head, 'cats').text = text_of("cats")

    iim = ET.SubElement(head, 'iim', ver='3')
    # `num` is passed as a real attribute of <ds>; embedding it in the tag
    # name only happened to serialise to the same text.
    ET.SubElement(iim, 'ds', num="1:20", value="79")
    # Dataset 1:30 is important for the import into CCI.
    ET.SubElement(iim, 'ds', num="1:30", value="TOIOnline")
    # NOTE: datasets 1:80 ("113052+0000") and 2:10 ("3") were disabled in the
    # original script and are intentionally not written.
    ET.SubElement(iim, 'ds', num="2:20", value="TOIOnline")
    ET.SubElement(iim, 'ds', num="2:25", value=headline)
    ET.SubElement(iim, 'ds', num="2:105", value=headline)

    # <body>: the cleaned article text.
    body = ET.SubElement(nitf, 'body')
    ET.SubElement(body, 'body.content').text = text_of("content")

    ET.ElementTree(nitf).write(
        define_filename(row['ID']), encoding='utf-8', xml_declaration=True
    )
    return row

相关内容

最新更新