I've been building a web scraper in Python to scrape Google Finance, but I can't locate the particular tag I'm looking for with the find() method. Eventually, out of frustration, I decided to write the returned data to a file and search it myself. So I wrote it to testing.html in the same directory and opened it with Google Chromium so I could use the inspect tool. Within a few minutes I had found the element I was looking for. What am I doing wrong? My code is attached below:
import dryscrape

session = dryscrape.Session()

def get(url):
    global session
    try:
        session.visit(url)
        data = session.body()
    except:
        print('Connection Failed')
    return str(data)

def save(price, stockname):
    pass

def extract(data):
    return data.find('<div class="YMLKec fxKbKc">')

class following():
    apple = "https://www.google.com/finance/quote/AAPL:NASDAQ"
    tesla = "https://www.google.com/finance/quote/TSLA:NASDAQ"
    google = "https://www.google.com/finance/quote/GOOGL:NASDAQ"
    amazon = "https://www.google.com/finance/quote/AMZN:NASDAQ"
    microsoft = "https://www.google.com/finance/quote/MSFT:NASDAQ"
    netflix = "https://www.google.com/finance/quote/NFLX:NASDAQ"

    def __init__():
        global apple
        global tesla
        global google
        global amazon
        global microsoft
        global netflix
        save(extract(get(following.apple)), following.apple)
        save(extract(get(following.tesla)), following.tesla)
        save(extract(get(following.google)), following.google)
        save(extract(get(following.amazon)), following.amazon)
        save(extract(get(following.microsoft)), following.microsoft)
        save(extract(get(following.netflix)), following.netflix)

f = open("testing.html")
print(extract(f.read()))
f.close()
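
For reference, the step where I dumped the page to testing.html isn't shown in the snippet above; it was roughly this (a minimal sketch, with following.apple just as an example):

# Write the rendered page body to a file so it can be inspected in a browser.
with open("testing.html", "w") as f:
    f.write(get(following.apple))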
Found the problem: the class is not YMLKec but YMlKec. It's a lowercase l, not a capital L.
data = open("testing.html", "r").read()
class_ = "YMlKec fxKbKc"
print(data.find(class_))
>>> 992880
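
For comparison, str.find() is case-sensitive, which is exactly why the original search never matched. A quick check against the same dump:

data = open("testing.html", "r").read()
print(data.find("YMLKec fxKbKc"))  # capital L: returns -1 (not found)
print(data.find("YMlKec fxKbKc"))  # lowercase l: returns the index (992880 here)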
Why not try the requests and BeautifulSoup libraries? Here is what I mean:
import requests
from bs4 import BeautifulSoup

class following():
    def __init__(self):
        self.session = requests.Session()
        self.session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'

    def get_last_price(self, link):
        r = self.session.get(link)
        return BeautifulSoup(r.text, "lxml")

    def extract(self, soup):
        return soup.select_one("[data-exchange='NASDAQ']")['data-last-price']

if __name__ == '__main__':
    base = "https://www.google.com/finance/quote/{}:NASDAQ"
    scraper = following()
    for ticker in ['AAPL', 'TSLA', 'GOOGL', 'AMZN', 'MSFT', 'NFLX']:
        soup_object = scraper.get_last_price(base.format(ticker))
        print(scraper.extract(soup_object))
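
If you would rather target the class from the question, the same soup object can also be queried with a CSS selector. A sketch, assuming Google Finance still serves the price inside an element carrying the (lowercase l) YMlKec fxKbKc classes:

# Alternative extraction: select by the class names from the question.
price_tag = soup_object.select_one(".YMlKec.fxKbKc")
if price_tag is not None:
    print(price_tag.text)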