使用BeautifulSoup解析公司列表时遇到的问题



我可以用以下代码解析标准普尔500指数公司列表:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import xlwings as xw
def get_sp500_info():
    """Scrape the S&P 500 constituents table from Wikipedia.

    Returns:
        A ``pandas.DataFrame`` with one row per table row and the columns
        ``tickers``, ``security``, ``gics_industry``, ``gics_sub_industry``
        plus a constant ``seclabels`` column set to ``'SP500'``. Cell text
        is stripped of surrounding whitespace; every column except
        ``security`` is lower-cased.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If no ``wikitable sortable`` table is found in the page.
    """
    resp = requests.get(
        "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies",
        timeout=30,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')

    # A CSS selector matches the table even when it carries extra classes;
    # a dict filter on the full class string only matches that exact string.
    table = soup.select_one('table.wikitable.sortable')
    if table is None:
        raise ValueError(
            "No 'wikitable sortable' table found on the S&P 500 page"
        )

    records = []
    for row in table.find_all('tr')[1:]:  # skip the header row
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if len(cells) < 5:
            # Rows without enough <td> cells cannot be mapped to columns.
            continue
        # Column positions 0, 1, 3 and 4 are the ones the table layout
        # used by this scraper exposes as ticker, name, industry and
        # sub-industry.
        records.append(
            (cells[0].lower(), cells[1], cells[3].lower(), cells[4].lower())
        )

    stocks_info_df = pd.DataFrame(
        records,
        columns=['tickers', 'security', 'gics_industry', 'gics_sub_industry'],
    )
    stocks_info_df['seclabels'] = 'SP500'
    return stocks_info_df
def open_in_excel(dataframe):
    """Hand *dataframe* to ``xw.view`` so it is shown in Excel."""
    xw.view(dataframe)
if __name__ == "__main__":
    # Scrape and display only when executed as a script, not on import.
    open_in_excel(get_sp500_info())

现在我想用与上面基本相同的代码解析Russell 3000公司列表,但它无法正常运行。

import requests
from bs4 import BeautifulSoup
import pandas as pd
import xlwings as xw
def get_russel3000_info():
    """Scrape the Russell 3000 holdings table from the iShares product page.

    Returns:
        A ``pandas.DataFrame`` with the columns ``tickers``, ``security``
        and ``gics_industry`` plus a constant ``seclabels`` column set to
        ``'Russel3000'``. Cell text is stripped of surrounding whitespace;
        ``tickers`` and ``gics_industry`` are lower-cased.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If the holdings table is absent from the downloaded
            HTML. ``requests`` only sees the static markup, so a table that
            the page builds with JavaScript is not present in it.
    """
    resp = requests.get(
        "https://www.ishares.com/us/products/239714/ishares-russell-3000-etf#holdings",
        timeout=30,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    table = soup.select_one(
        'table.display.product-table.border-row.dataTable.no-footer'
    )
    if table is None:
        # Fail with an explanation instead of the opaque
        # "'NoneType' object has no attribute 'findAll'" further down.
        raise ValueError(
            "Holdings table not found in the downloaded HTML; it is likely "
            "rendered by JavaScript, so fetch the rendered page with a "
            "browser driver such as Selenium instead of requests"
        )

    records = []
    for row in table.find_all('tr')[1:]:  # Line A
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if len(cells) < 3:
            # Rows without enough <td> cells cannot be mapped to columns.
            continue
        records.append((cells[0].lower(), cells[1], cells[2].lower()))

    stocks_info_df = pd.DataFrame(
        records,
        columns=['tickers', 'security', 'gics_industry'],
    )
    stocks_info_df['seclabels'] = 'Russel3000'
    return stocks_info_df
def open_in_excel(dataframe):
    """Hand *dataframe* to ``xw.view`` so it is shown in Excel."""
    xw.view(dataframe)
if __name__ == "__main__":
    # Scrape and display only when executed as a script, not on import.
    open_in_excel(get_russel3000_info())

我不明白为什么它适用于S&P 500而不适用于Russell 3000。在标有"Line A"的那一行,我会得到以下错误:

Exception has occurred: AttributeError
'NoneType' object has no attribute 'findAll'

按理说这里不应该返回"None"。非常感谢任何提示 :-)

可以直接将表加载到pandas中:

# pd.read_html returns a list of DataFrames, one per table it parses from
# the page, so `df` is indexed as df[0], df[1], ...
df = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")

read_html会返回一个列表,您可以通过df[0]、df[1]等访问页面上的各个表。而在ishares.com的情况下,所需的表不会出现在下载到的HTML中,因为它是在浏览器端通过JavaScript加载的。一种解决方案是使用Selenium来完成这项工作:

from io import StringIO
import time

import pandas as pd
from selenium import webdriver

# Run Chrome without a visible window; the two extra flags are the usual
# settings for containerised / restricted environments.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
url = "https://www.ishares.com/us/products/239714/ishares-russell-3000-etf#holdings"

# Pass only `options`: newer Selenium releases no longer accept the driver
# path as a positional argument, and older ones already default to looking
# up "chromedriver" on PATH, so this works with both.
wd = webdriver.Chrome(options=options)
try:
    wd.get(url)
    time.sleep(5)  # sleep for a few seconds to allow loading the data
    # Wrap the markup in StringIO: passing literal HTML text straight to
    # read_html is deprecated in recent pandas versions.
    df = pd.read_html(StringIO(wd.page_source))
finally:
    # Always shut the browser down, even if loading or parsing fails,
    # so no headless Chrome process is left behind.
    wd.quit()

df[7]是您正在查找的表:

(以下表格输出在提取时被压缩成一行,此处按可辨认的数值重新分行;部分单元格已在提取中丢失。)

权责发生制日期

$560,367,328.56  5.16  5.60367e+08  4.38506e+06  -
$482,112,717.24  4.44  4.82113e+08  2.03475e+06  -
$362,479,373.96  3.62479e+08  -
$172,844,238.24  1.59  1.72844e+08  -
$168,815,957.22  1.55  1.68816e+08  -

最新更新