How do I separate each column of the web-scraping results when writing the data to Excel?



I know how to separate the data when it looks like this:

x, y, z

but I don't know what to do when it is formatted like this:

Doe, John, BookName, Year, abstract with commas, links. 
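(For reference: Python's csv module already quotes any field that contains the delimiter, so commas inside a single field do not create extra columns on their own. A minimal sketch with made-up data; demo.csv is a hypothetical file name:)

import csv

# QUOTE_MINIMAL (the default) wraps any field containing the delimiter in quotes
with open('demo.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Doe, John', 'BookName', '2020', 'abstract, with, commas', 'link'])
# demo.csv now holds: "Doe, John",BookName,2020,"abstract, with, commas",link
# Excel keeps each quoted field in one cell when the file is parsed as comma-separated.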

This is what the data looks like in Excel after scraping:
This is what I want it to look like:
Here is my code:

import requests
from bs4 import BeautifulSoup
import csv
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

fakdep = '165'
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep, offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

datas = []
count_page = 0
for page in range(1, 2):  # currently a single page
    count_page += 1
    print('Scraping Offset No:', count_page)
    result = requests.get(url + str(page), headers=headers, verify=False)

    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li', 'ds-artifact-item')
    for it in items:
        author = it.find('span', 'author h4').text
        # strip the newlines the repository leaves inside titles and abstracts
        title = ''.join(it.find('a', href=True).text.strip().split('\n'))
        year = it.find('span', 'date').text
        abstract = ''.join(it.find('div', 'artifact-abstract').text.strip().split('\n'))
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
thewriter = csv.writer(open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline=''))
thewriter.writerow(kepala)
for d in datas:
    thewriter.writerow(d)
Here is my suggestion; I would need to know an offset value to test it properly.

A semicolon-separated CSV is much easier for Excel to handle:

import requests
from bs4 import BeautifulSoup
import csv
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

fakdep = '165'
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep, offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    result = requests.get(url + str(page), headers=headers, verify=False)

    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li', 'ds-artifact-item')
    for it in items:
        author = it.find('span', 'author h4').text
        # replace() removes the embedded newlines directly
        title = it.find('a', href=True).text.strip().replace('\n', '')
        year = it.find('span', 'date').text
        abstract = it.find('div', 'artifact-abstract').text.strip().replace('\n', '')
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
# delimiter=';' keeps Excel from splitting on the commas inside the abstracts
thewriter = csv.writer(open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline=''), delimiter=';')
thewriter.writerow(kepala)
for d in datas:
    thewriter.writerow(d)
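
If Excel still puts everything into one column, an alternative is to skip CSV entirely and write an .xlsx workbook, so Excel never has to guess a delimiter. A minimal sketch, assuming the openpyxl package is installed and reusing the datas, fakdep and offset variables from the script above:

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.append(['Author', 'Title', 'Year', 'Abstract', 'Link'])  # header row
for row in datas:
    ws.append(row)  # one list element per cell, so embedded commas are harmless
wb.save('results/{}_{}.xlsx'.format(fakdep, offset))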
