Python:Webdriver获取URL列表,重复数据



我正试图从URL中提取历史数据。日期(epcho时间(是URL的一部分。

import pandas as pd
import numpy as np
from selenium import webdriver
import chromedriver_binary
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import time
from datetime import datetime
options = Options()
options.headless = True
lastDate = '2021-07-01'
firstDate = '2010-01-01'
time_object = time.strptime(lastDate, '%Y-%m-%d')
period2 = int(time.mktime(time_object))
period1 = int(period2 - 86400*200)
time_object = time.strptime(firstDate, '%Y-%m-%d')
period0 = time.mktime(time_object)
count = 1
url=f"https://finance.yahoo.com/quote/%5EGSPC/history?period1={period1}&period2={period2}&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true"
#url=r'https://finance.yahoo.com/quote/%5EGSPC/history?period1=1262304000&period2=1625097600&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true'
while period2 >= period0:
ed = datetime.fromtimestamp(period2)
sd = datetime.fromtimestamp(period1)
print(f"Working on {sd} {ed}, current count {count}")
print(f"URL is {url}")
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(20)
driver.get(url)
js = "var q=document.documentElement.scrollTop=100000" 
driver.execute_script(js)

for row in t.tbody.findAll('tr'):
date = row.findAll('td')[0].text
date = datetime.strptime(date, "%b %d, %Y")
date = date.strftime("%Y-%m-%d")
open = row.findAll('td')[1].text.replace(',','')
high = row.findAll('td')[2].text.replace(',','')
low = row.findAll('td')[3].text.replace(',','')
close = row.findAll('td')[4].text.replace(',','')
adjclose = row.findAll('td')[5].text.replace(',','')
volume = row.findAll('td')[6].text.replace(',','')
hist = pd.DataFrame([[date,open,high,low,close,adjclose,volume]], columns=['Date', 'Open','High','Low','Close', 'Adj Close', 'Volumn'])
if count == 1:
hist.to_csv('hist.csv', index=False, header=True)
else:
hist.to_csv('hist.csv', index=False, mode='a', header=False)  

count = count + 1    
period2 = int(period1)
period1 = int(period2 - 86400*200)
url=f"https://finance.yahoo.com/quote/%5EGSPC/history?period1={period1}&period2={period2}&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true"
driver.close()

我打印了URL,它用新刷新的时间段正确更新了。但是,写入我的hist.csv的内容是重复的。似乎驱动程序只尊重我的第一个URL,而完全忽略了其余的。因此,我在历史.csv 中重复了第一段日期/价格等

如果您有任何意见,不胜感激。感谢

请忽略-我刚刚意识到我在使用jupyter时没有刷新变量。我在发布问题2分钟后就发现了这个问题。感谢伟大的stackoverflow!

最新更新