python 将网页上的所有图像下载到一个文件夹中



我想从这个网页下载所有图像,但我的代码不能正常工作。基于下面这个脚本,我该怎么修改呢?

import os
from urllib.parse import urljoin
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup

# Fetch the results page; some sites reject requests without a User-Agent.
page_url = 'https://www.dreamstime.com/free-results.php?securitycheck=afbb79db0e7e374867295876228b135a&firstvalue=&lastsearchvalue=&srh_field=doges&searchby=doges&s_free=y&s_cc0=y'
html = requests.get(page_url, headers={"User-Agent": "XY"})
html.encoding = 'utf-8'
sp = BeautifulSoup(html.text, 'html.parser')

# Local directory the images are written into; created on first run.
images_dir = "images/"
if not os.path.exists(images_dir):
    os.mkdir(images_dir)

# Candidate image URLs come from both <img src=...> and <a href=...>.
for link in sp.find_all(['a', 'img']):
    for attr in (link.get("src"), link.get("href")):
        if attr and ('.jpg' in attr or '.png' in attr):
            # Bug fix: links on the page can be relative; urljoin resolves
            # them against the page URL so urlopen always gets an absolute URL.
            full_path = urljoin(page_url, attr)
            filename = full_path.split('/')[-1]
            print(full_path)
            try:
                # 'with' closes both the connection and the file even on error.
                with urlopen(full_path) as image, \
                        open(os.path.join(images_dir, filename), 'wb') as f:
                    f.write(image.read())
            except OSError:
                # urlopen raises URLError/HTTPError (OSError subclasses) on
                # failure; report it and continue with the next image instead
                # of swallowing everything with a bare except.
                print("{} fail".format(filename))
import os
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = requests.get('https://www.google.de/?gws_rd=ssl', headers={"User-Agent": "XY"})
my_domain = "https://www.google.de"
# The original problem: the links are relative and do not have the domain name.
# You should check if "http" is present in the link;
# if not, check if there is a base element <base href="https://example.com">;
# if there is, add that to the URI, otherwise add my_domain.
# Here my_domain is prepended unconditionally, which breaks already-absolute links.
html.encoding = 'utf-8'
sp = BeautifulSoup(html.text, 'html.parser')

# Local destination directory for the downloaded images.
images_dir = "images/"
if not os.path.exists(images_dir):
    # Bug fix: os.path is a module and is not callable -- os.path(images_dir)
    # raised TypeError; abspath gives the intended diagnostic.
    print(f"was not found {os.path.abspath(images_dir)}")
    os.mkdir(images_dir)

# Collect every <a> and <img> tag on the page.
all_links = sp.find_all(['a', 'img'])
for link in all_links:
    src = link.get("src")
    href = link.get("href")
    for attr in (src, href):
        if attr and ('.jpg' in attr or '.png' in attr):
            my_path = attr
            full_path = my_domain + my_path
            filename = my_path.split('/')[-1]
            print(f"My image ref -> {my_path}")
            print(f"My full image ref -> {full_path}")
            print(f"My image filename-> {filename}")
            try:
                image = urlopen(full_path)
                # 'with' guarantees the output file is closed even on error.
                with open(os.path.join(images_dir, filename), 'wb') as f:
                    f.write(image.read())
            except OSError:
                # Bug fix: urlopen failures raise URLError/HTTPError (OSError
                # subclasses), never NameError, so the old handler was dead code.
                print("{} fail".format(filename))

我制作了一个新版本,它在遇到站点验证码(captcha)时表现更好。首先,我尝试只加一个 User-Agent 头,但网站忽略了它。如果你愿意,你可以先读出网站的 cookie,然后把它加到请求头里。以下是可以按需添加的示例头(具体值需要你自己从网站上收集):# "cookie": "__gads=ID=b2281f3501a53……"  # "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win6……)"

我必须强调,这是一次智力锻炼,我永远不会接受我不拥有的内容,或者只允许在特定的背景下使用。我认为这段代码是一种简单的方法,可以分析特定标签的问题,或者盘点我使用的标签。

import os
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

my_domain_short = "www.google.de"
my_domain = f"https://{my_domain_short}"
# Browser-like headers so the site treats the request as human traffic and
# is less likely to serve a captcha.
html = requests.get(my_domain,
                    headers={
                        "Accept": "application/signed-exchange;v=b3;q=0.7,*/*;q=0.8",
                        "Accept-Encoding": "gzip, deflate, br",
                        "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8,de;q=0.7,nl;q=0.6",
                        "Host": my_domain_short,
                        "Purpose": "prefetch",
                        "Referer": my_domain,
                    })
html.encoding = 'utf-8'
sp = BeautifulSoup(html.text, 'html.parser')
tag_base = sp.base
print(tag_base)

# This is where the images are stored locally.
images_dir = "images/"
print(f"My images path {os.path.exists(images_dir)}")
# If the images path does not exist, make it.
if not os.path.exists(images_dir):
    # Bug fix: os.path is a module, not a callable -- os.path(images_dir)
    # raised TypeError; abspath produces the intended message.
    print(f"was not found {os.path.abspath(images_dir)}")
    os.mkdir(images_dir)

# Get all <a> and <img> tags.
all_links = sp.find_all(['a', 'img'])
# Is there a <base> element suggesting the page uses relative URLs?
# Bug fix: find() takes a tag NAME -- find('base href') never matches anything.
base_path = sp.find('base')
print(f"My base Path: {base_path}")

# Loop through the results and fetch the pictures.
for link in all_links:
    src = link.get("src")
    href = link.get("href")
    for attr in (src, href):
        if attr and ('.jpg' in attr or '.png' in attr):
            my_path = attr
            print(f"My image ref -> {my_path}")
            # Absolute URIs pass through unchanged; relative ones get the
            # domain prefixed. Bug fix: startswith() instead of a substring
            # test, so a relative path merely *containing* "http" is still
            # treated as relative.
            if my_path.startswith(("http://", "https://")):
                full_path = my_path
            else:
                full_path = my_domain + my_path
            print(full_path)
            filename = my_path.split('/')[-1]
            print(f"My full image ref -> {full_path}")
            print(f"My image filename-> {filename}")
            # Try to write the file to the local machine.
            try:
                image = urlopen(full_path)
                with open(os.path.join(images_dir, filename), 'wb') as f:
                    f.write(image.read())
            except OSError:
                # Bug fix: network failures raise URLError/HTTPError (OSError
                # subclasses), not NameError.
                print("{} fail".format(filename))

这是另一个更简单的版本,可以作为导入运行,也可以从控制台或脚本运行,在这种情况下,这个版本只列出了robots.txt的内容作为示例。

import sys
from urllib.request import urlopen

def get_content(url):
    """Fetch *url* and return its body as a list of whitespace-separated words.

    Args:
        url: The URL of a UTF-8 text document (any scheme urlopen supports).

    Returns:
        A list of word strings in document order. When called through an
        import, this is the value the caller receives.
    """
    # Bug fix: the original closed the connection manually, leaking it if
    # decoding raised; the context manager closes it on every path.
    with urlopen(url) as story:
        return [word
                for line in story
                for word in line.decode("utf8").split()]

# How to stop code running on import: see the __main__ guard below. To list a module's functions after importing it, use dir(module).
def print_content(story_words):
    """Write each entry of *story_words* to stdout, one per line.

    Safe to call from an importing module.
    """
    for entry in story_words:
        print(entry)

def main(url):
    """Fetch the document at *url* and print its words to the console.

    Args:
        url: The URL of a UTF-8 text document.
    """
    # Fetch first, then hand the resulting word list straight to the printer.
    # (Reading url here, rather than from sys.argv, keeps imports side-effect
    # free -- the command line is parsed only under the __main__ guard.)
    print_content(get_content(url))

if __name__ == '__main__':
    # Run as a script:  .script.py https://www.google.com/robots.txt
    # When imported, nothing in this branch executes.
    #
    # Bug fix: the original wrapped main(sys.argv[1]) in "except IndexError",
    # which also swallowed any IndexError raised *inside* main(); checking the
    # argument count up front keeps the fallback but not the masking.
    if len(sys.argv) > 1:
        # The URL is taken from the console after the command.
        main(sys.argv[1])
    else:
        print("list index out of range")
        print('Module not run from command line passing ".script.py https://www.google.com/robots.txt"')
        main("https://www.google.com/robots.txt")

相关内容

最新更新