正在从网站下载文件-HTTPError:HTTP错误403:禁止



我正在尝试从EDGAR下载10Ks(上市公司的年度报告(。我正在运行下面的代码(从课本上用过,不太了解(,但一直收到以下错误:(在运行下面的程序之前,我下载了代码中描述的"master.idx"文件(。

HTTPError:HTTP错误403:禁止

你能帮我解决吗?

import urllib.request
import shutil
import os
import re
from pathlib import Path
def get_files(start_year:int, end_year:int,
reform:str, 
inddirect:str, odirect:str):
"""
Downloads SEC filings for specific companies
start_year -> First Year to download
end_year -> Last Year to download
reform -> Regex to specify forms to be downloaded
inddirect -> Directory containing index files
odirect -> Directory the filings will be downloaded to
"""
print('Downloading Filings')
# Regex to identify the form to download.
re_formtype = re.compile(reform, re.IGNORECASE)
# Regex to extract file name information 
# from a line
re_fullfilename = re.compile(r"|(edgar/data.*/([d-]+.txt))", re.IGNORECASE)
#loop through the index files based on year
for year in range(start_year, end_year+1):
#check whether the directory exists and create one 
# if it does not.
download_path = os.path.join(odirect, str(year))
if not os.path.exists(download_path):
os.makedirs(download_path)

for qtr in range(1,5):
#name of index file to be read.
dl_file = os.path.join(inddirect, 'master' + str(year) + str(qtr) + '.idx')

# check to see if the index file exists.
if not os.access(dl_file, os.R_OK):
# Download the index file if it does not 
# already exist
url='https://www.sec.gov/Archives/edgar/full-index/' + str(year) + '/' + 'QTR' + str(qtr) + '/master.idx'
# download the file defined as url and 
# download to the file defined a dl_fle.
urllib.request.urlretrieve(url, dl_file)
# open the index file
with open(dl_file, 'r') as f:
# set a counter called count to 1. Note 
# that the counter will only be incremented 
# after it downloads a file.
count=1

# loop through each line in the index file, 
# assigning to a variable called line
for line in f:
# Only download a file if the counter 
# is less than 5.
# Remove this if statement if you want
# to download all the files for the
# time period
if count<5:
# Check to see if the the line  
# matches the form type 
rematch=re.search(re_formtype,line)
#If there is a match then download 
# the filing
if rematch:
# The following line searches 
# for filename information. 
# The first grouping will 
# contain the location and 
# filename of the file to be 
# downloaded. The second
# grouping will contain just 
# the filename o
matches = re.search(re_fullfilename, line)
if matches:
# Construct the url to for 
# retrieving the filing 
url = str('https://www.sec.gov/Archives/') + str(matches.group(1))
# Create the filename to 
# download the file to.
outfile = os.path.join(download_path, 
str(matches.group(2)))
# Check to make sure the  
# file hasn't already 
# been downloaded

if not (os.path.isfile(outfile) and os.access(outfile, os.R_OK)):
# Print the name of the 
# file to be downloaded.
print("Downloading:"+str(outfile),end='n')
#downlaod the file
urllib.request.urlretrieve(url, outfile)    
count += 1
print('Downloading of Filings Complete',end='n')
return

# Specify, in regular expression format, the filing
# you are looking for.  Following is the for 10-k.
reform='(|10-?k(sb|sb40|405)?s*|)'
# Specify location of the index files.
inddirect = os.path.join(Path.home(), 'edgar', 'indexfiles')
# Specify where to download filings to
odirect = os.path.join(Path.home(), 'edgar', '10K')
# Execute the get filings function
get_files(2018, 2019, reform, inddirect, odirect)

替换您的行:

urllib.request.urlretrieve(url, dl_file)

使用这些行,因为您缺少用户代理

opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', 'MyApp/1.0')]
urllib.request.install_opener(opener)
urllib.request.urlretrieve(url, dl_file)

相关内容

  • 没有找到相关文章

最新更新