这是我第一次使用panda,我正在尝试制作一个forloop,从网站中提取mp3链接并将其放入csv文件中。对于每个相册链接,它将创建一个新文件夹和一个新的csv文件,然后将mp3放入csv中。
一切正常但我有一个主要问题-数据帧不断将上一个循环中的数据附加到当前循环中。所以我的数据帧的list
越来越大。
以下是代码的样子:
from bs4 import BeautifulSoup
import urllib.request
import re
import pandas as pd
import os
import csv
def mpull():
albumlist()
baseurl = "https://downloads.khinsider.com"
alist = albumlist.albums_str
llist = albumlist.link_str
fullsoup = []
for l, ab in zip(llist, alist):
os.mkdir(ab)
url = urllib.request.urlopen(l)
content = url.read()
soup = BeautifulSoup(content, features="html.parser")
for a in soup.findAll('a',href=re.compile('/*.mp3')):
df = pd.DataFrame([])
fullsoup.append(baseurl+a['href'])
remove_dup(fullsoup)
df = pd.DataFrame(fullsoup)
df.to_csv(ab+"/"+ab+".csv", index=False, header=False)
print(fullsoup)
mpull()
我想要的是:
007 everything or nothing:
https://downloads.khinsider.com/game-soundtracks/album/007-everything-or-nothing/EON-01-James-Bond-Theme.mp3
https://downloads.khinsider.com/game-soundtracks/album/007-everything-or-nothing/EON-02-Russian-Liar.mp3
#MORE 007 everything or nothing songs
我得到的是:
007 everything or nothing:
#songs from the last loop appear first for some reason
https://downloads.khinsider.com/game-soundtracks/album/007-blood-stone/01-%2520James%2520Bond-Blood%2520Stone%2520Theme%2520Song.mp3
https://downloads.khinsider.com/game-soundtracks/album/007-blood-stone/02-%2520M%2520Puts%2520Her%2520Trust%2520in%2520Bond.mp3
#Then the right songs appear afterwards
https://downloads.khinsider.com/game-soundtracks/album/007-everything-or-nothing/EON-01-James-Bond-Theme.mp3
https://downloads.khinsider.com/game-soundtracks/album/007-everything-or-nothing/EON-02-Russian-Liar.mp3
#MORE 007 everything or nothing songs
我尝试过的:我尝试将del df
添加到循环的末尾,如下所示:
from bs4 import BeautifulSoup
import urllib.request
import re
import pandas as pd
import os
import csv
def mpull():
albumlist()
baseurl = "https://downloads.khinsider.com"
alist = albumlist.albums_str
llist = albumlist.link_str
fullsoup = []
for l, ab in zip(llist, alist):
os.mkdir(ab)
url = urllib.request.urlopen(l)
content = url.read()
soup = BeautifulSoup(content, features="html.parser")
for a in soup.findAll('a',href=re.compile('/*.mp3')):
df = pd.DataFrame([])
fullsoup.append(baseurl+a['href'])
remove_dup(fullsoup)
df = pd.DataFrame(fullsoup)
df.to_csv(ab+"/"+ab+".csv", index=False, header=False)
del def
print(fullsoup)
mpull()
但这似乎没有起到任何作用——它仍然将最后一个循环的数据帧附加到当前的csv迭代中。
任何想法都会很棒。谢谢
我想通了!!!而不是尝试删除数据帧df
。我需要在每个循环迭代中删除fullsoup
列表,这样它就不会在每个循环中保留列表的数据。
def mpull():
albumlist()
baseurl = "https://downloads.khinsider.com"
alist = albumlist.albums_str
llist = albumlist.link_str
for l, ab in zip(llist, alist):
fullsoup = []
os.mkdir(ab)
url = urllib.request.urlopen(l)
content = url.read()
soup = BeautifulSoup(content, features="html.parser")
for a in soup.findAll('a',href=re.compile('/*.mp3')):
df = pd.DataFrame([])
fullsoup.append(baseurl+a['href'])
# remove_dup(fullsoup)
df = pd.DataFrame(fullsoup)
print(fullsoup)
del fullsoup
df.to_csv(ab+"/"+ab+".csv", index=False, header=False)
mpull()