如何在Python中使用Beautiful Soup,通过字典循环抓取多个URL



好的,我已经创建了一个web scraper。我对python还很陌生,所以如果你有其他建议给我,那就太棒了。

我的问题是:如何迭代字典中的每一项?也就是 url = (公共URL) + (字典中的值)。我想遍历字典中的每个值,抓取对应页面,然后将结果输出到CSV。

请注意:如果某家公司今天(9月12日)没有发布任何内容,就不会向CSV输出任何行;唯一应该有输出的公司是"Ackroo":"00018766"。

感谢

from bs4 import BeautifulSoup
import urllib2
import time
import csv
links_list = []
other_list = []
time = time.strftime("%b %d %Y")

company_dict = {
    "Baytex Energy": "00031017",
    "Crescent Point Energy": "00028658",
    "Legacy Oil + Gas": "00023400",
    "Leucrotta Exploration": "00036173",
    "Lightstream Resources": "00028969",
    "Pengrowth Energy": "00031000",
    "Surge Energy": "00010447",
    "Ackroo": "00018766"
}
url = 'http://www.sedar.com/DisplayCompanyDocuments.do?lang=EN&issuerNo=%s' % (company_dict['Ackroo'])
sedar = urllib2.urlopen(url)
read_sedar = sedar.read()
soup = BeautifulSoup(read_sedar)


def print_list():
    tds = soup.find_all('td')
    for tr in soup.find_all('tr'):
        tds = tr.find_all('td')
        row = [elem.text.encode('utf-8') for elem in tds[0:len(tds)-1]]
        for x in row[4:]:
            other_list.append(row)
    return other_list

def get_links():
    html = 'http://www.sedar.com'
        for form in soup.find_all('form'):
        links_list.append(html + form.get('action'))
    return links_list
def write_to_output():
    text = print_list()
    get_links()
    text = [[s.strip() for s in inner] for inner in text[1:]]
    for a in text:
        a.remove('')
    i = 0
    with open('sedar.csv', 'wb') as outputfile:
        writer = csv.writer(outputfile)
        for b in text:
            if b[0] == time:
                writer.writerow(['Surge'])
                writer.writerow(text[i])
                writer.writerow([links_list[i]])
                writer.writerow('')
                i = i + 1
            else:
                print "done"
                break
write_to_output()

我对你的代码做了一些重组,我认为它可以满足你的需求;不过我还没有测试,因为我这里没有安装Beautiful Soup。

company_dict = {
    "Baytex Energy": "00031017",
    "Crescent Point Energy": "00028658",
    "Legacy Oil + Gas": "00023400",
    "Leucrotta Exploration": "00036173",
    "Lightstream Resources": "00028969",
    "Pengrowth Energy": "00031000",
    "Surge Energy": "00010447",
    "Ackroo": "00018766"
}
base_url = 'http://www.sedar.com/DisplayCompanyDocuments.do'
def get_other(soup):
    other_list = []
    tds = soup.find_all('td')
    for tr in soup.find_all('tr'):
        tds = tr.find_all('td')
        row = [elem.text.encode('utf-8') for elem in tds[0:len(tds)-1]]
        for x in row[4:]:
            other_list.append(row)
    return other_list

def get_links(soup):
    links_list = []
    html = 'http://www.sedar.com'
    for form in soup.find_all('form'):
        links_list.append(html + form.get('action'))
    return links_list

def write_to_output(writer, today, other_list, links_list):
    text = [[s.strip() for s in inner] for inner in other_list[1:]]
    for a in text:
        a.remove('')
    i = 0
    for b in text:
        if b[0] == today:
            writer.writerow(['Surge'])
            writer.writerow(text[i])
            writer.writerow([links_list[i]])
            writer.writerow('')
            i = i + 1
        else:
            print "done"
            break

def main():
    today = time.strftime("%b %d %Y")
    with open('sedar.csv', 'wb') as outputfile:
        writer = csv.writer(outputfile)
        for company, issuerNo in company_dict.items():
            url = '%s?lang=EN&issuerNo=%s' % (base_url, issuerNo)
            print '%-25s : %s' % (company, url)
            sedar = urllib2.urlopen(url)
            read_sedar = sedar.read()
            soup = BeautifulSoup(read_sedar)
            other_list = get_other(soup)
            links_list = get_links(soup)
            write_to_output(writer, today, other_list, links_list)

if __name__ == '__main__':
    main()

如果你的问题归结为"我如何迭代字典项?"那么它是这样的:

for k,v in d.items():
   print "%s -> %s"%(k,v)

存在类似 d.iteritems() 和 d.viewitems() 的替代方案,它们在某些情况下可能是优选的,但它们都可以按上面的方式使用。

最新更新