Okay, I've created a web scraper. I'm still pretty new to Python, so if you have any other suggestions for me, that would be great.
Basically, my question is: how do I iterate over each dictionary item? Essentially, url = (common URL) + (value from the dictionary). I want to loop through every value in the dictionary and then output the results to CSV.
Just a note: if a company hasn't published anything today, then nothing for Sep 12 (today) gets written to the CSV; the only company that should output anything is "Ackroo": "00018766".
Thanks
from bs4 import BeautifulSoup
import urllib2
import time
import csv

links_list = []
other_list = []
today = time.strftime("%b %d %Y")  # e.g. "Sep 12 2014"; renamed so it no longer shadows the time module

company_dict = {
    "Baytex Energy": "00031017",
    "Crescent Point Energy": "00028658",
    "Legacy Oil + Gas": "00023400",
    "Leucrotta Exploration": "00036173",
    "Lightstream Resources": "00028969",
    "Pengrowth Energy": "00031000",
    "Surge Energy": "00010447",
    "Ackroo": "00018766"
}

url = 'http://www.sedar.com/DisplayCompanyDocuments.do?lang=EN&issuerNo=%s' % (company_dict['Ackroo'])
sedar = urllib2.urlopen(url)
read_sedar = sedar.read()
soup = BeautifulSoup(read_sedar)

def print_list():
    for tr in soup.find_all('tr'):
        tds = tr.find_all('td')
        row = [elem.text.encode('utf-8') for elem in tds[:-1]]
        # Appends the row once for every cell past the fourth, so rows with
        # fewer than five cells are skipped (and longer rows are appended more than once).
        for x in row[4:]:
            other_list.append(row)
    return other_list

def get_links():
    html = 'http://www.sedar.com'
    for form in soup.find_all('form'):
        links_list.append(html + form.get('action'))
    return links_list

def write_to_output():
    text = print_list()
    get_links()
    text = [[s.strip() for s in inner] for inner in text[1:]]
    for a in text:
        a.remove('')  # drops the first empty cell; raises ValueError if a row has none
    i = 0
    with open('sedar.csv', 'wb') as outputfile:
        writer = csv.writer(outputfile)
        for b in text:
            if b[0] == today:
                writer.writerow(['Surge'])
                writer.writerow(text[i])
                writer.writerow([links_list[i]])
                writer.writerow('')  # blank separator row
                i = i + 1
            else:
                print "done"
                break

write_to_output()
I've reorganized your code a bit; I think it does what you want, but I haven't tested it, since I don't have Beautiful Soup installed.
from bs4 import BeautifulSoup
import urllib2
import time
import csv

company_dict = {
    "Baytex Energy": "00031017",
    "Crescent Point Energy": "00028658",
    "Legacy Oil + Gas": "00023400",
    "Leucrotta Exploration": "00036173",
    "Lightstream Resources": "00028969",
    "Pengrowth Energy": "00031000",
    "Surge Energy": "00010447",
    "Ackroo": "00018766"
}

base_url = 'http://www.sedar.com/DisplayCompanyDocuments.do'

def get_other(soup):
    other_list = []
    for tr in soup.find_all('tr'):
        tds = tr.find_all('td')
        row = [elem.text.encode('utf-8') for elem in tds[:-1]]
        for x in row[4:]:
            other_list.append(row)
    return other_list

def get_links(soup):
    links_list = []
    html = 'http://www.sedar.com'
    for form in soup.find_all('form'):
        links_list.append(html + form.get('action'))
    return links_list

def write_to_output(writer, today, other_list, links_list):
    text = [[s.strip() for s in inner] for inner in other_list[1:]]
    for a in text:
        a.remove('')
    i = 0
    for b in text:
        if b[0] == today:
            writer.writerow(['Surge'])
            writer.writerow(text[i])
            writer.writerow([links_list[i]])
            writer.writerow('')
            i = i + 1
        else:
            print "done"
            break

def main():
    today = time.strftime("%b %d %Y")
    with open('sedar.csv', 'wb') as outputfile:
        writer = csv.writer(outputfile)
        # One request per company: same page, different issuerNo query parameter.
        for company, issuerNo in company_dict.items():
            url = '%s?lang=EN&issuerNo=%s' % (base_url, issuerNo)
            print '%-25s : %s' % (company, url)
            sedar = urllib2.urlopen(url)
            read_sedar = sedar.read()
            soup = BeautifulSoup(read_sedar)
            other_list = get_other(soup)
            links_list = get_links(soup)
            write_to_output(writer, today, other_list, links_list)

if __name__ == '__main__':
    main()
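One thing to watch in the version above: write_to_output() still writes the hardcoded label 'Surge' for every company. Since main() already has the company name from the dictionary loop, here is a minimal sketch of a variant that passes it through (untested, and the extra company parameter is my addition, not part of the code above):

def write_to_output(writer, today, company, other_list, links_list):
    text = [[s.strip() for s in inner] for inner in other_list[1:]]
    for a in text:
        a.remove('')
    for i, b in enumerate(text):
        if b[0] == today:
            writer.writerow([company])  # label the block with the real company name (was 'Surge')
            writer.writerow(b)
            writer.writerow([links_list[i]])
            writer.writerow('')
        else:
            break

# and the call in main() becomes:
#     write_to_output(writer, today, company, other_list, links_list)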
If your question boils down to "How do I iterate over dictionary items?", then it goes like this:

for k, v in d.items():
    print "%s -> %s" % (k, v)

There are alternatives such as d.iteritems() and d.viewitems() that may be preferable in some situations, but they are all used the same way as above.
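Applied to your company_dict, that is all the iteration you need to build each URL (a quick sketch, assuming the company_dict and base_url from the answer above are in scope):

for company, issuerNo in company_dict.items():
    url = '%s?lang=EN&issuerNo=%s' % (base_url, issuerNo)
    print '%s -> %s' % (company, url)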