本地工作的硒浏览器下载文件自动化代码在 AWS Linux 服务器中不起作用?



我正在尝试从客户端仪表板自动下载和处理某个csv文件。下面的代码适用于我的本地,无论是无头模式还是有头模式。我还可以在本地手动下载文件。但是,当我在 aws linux 服务器上运行代码时,我无法下载该文件。该文件很大,需要定期导入,因此我需要将其放入服务器中的工作 cron 中。任何人都可以浏览此代码并告诉我应该怎么做才能将文件下载到我的服务器中吗?

谢谢

/法典:

from selenium import webdriver
import re, unicodedata, time, datetime, arrow
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os, sys
import pandas as pd
print "### {} ###".format(arrow.now())
def browser_init(page_load_timeout=15):
chromeOptions = webdriver.ChromeOptions()
prefs = {"download.default_directory" : "/Users/deepak/Desktop/Adlmnt/"}
chromeOptions.add_experimental_option("prefs",prefs)
webdriver_paths = [
"/Users/deepak/Downloads/chromedriver_Mac2",
"/feed_log/pyenv/selenium/webdriver/chrome/chromedriver_3"
]
download_dir = [
"/Users/deepak/Desktop/Adlmnt/Ibv/ibv/discovery/working/Cdt_crawl/data",
"/feed_log/pyenv/selenium/webdriver/chrome/downloads/"
]
chromeOptions = webdriver.ChromeOptions()
prefs = {"download.default_directory" : download_dir}
chromeOptions.add_argument("--headless")
chromeOptions.add_argument("--disable-gpu")
chromeOptions.add_argument("--window-size=1920,1080")
chromeOptions.add_experimental_option("prefs",prefs)
dpath = ''
## find webdriver, open browser instance
for path_ in webdriver_paths:
try:
browser = webdriver.Chrome(executable_path=path_, chrome_options=chromeOptions, service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
browser.set_page_load_timeout(page_load_timeout)
browser.implicitly_wait(10)  # seconds
#browser.set_window_size(300, 500)
browser.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
dpath = download_dir[webdriver_paths.index(path_)]
params = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior':
'allow', 'downloadPath': dpath}}
browser.execute("send_command", params)
print path_
print dpath
browse = browser
break
except Exception as e:
print "could not open browser: ", e
browse = None
continue
print browse
return browse, dpath

## login cdt
def cdt_init(browser,login_timeout=5):
browser.get("https://dashboard.cdt.com/#/login")
try:
user = WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.NAME, "username")))
finally:
user.send_keys("user_name")
browser.find_element_by_name("password").send_keys("pass_word")
browser.find_element_by_name("password").submit()
print "timing out for login"
time.sleep(login_timeout)
return browser
def cdt_labs(browser,dpath='/feed_log/pyenv/selenium/webdriver/chrome/downloads/', labs=None,dowload_timeout=15):
try:
labs = labs.format(arrow.now().format('YYYY-MM-DD'),arrow.now().format('YYYY-MM-DD'))
print "fetching link: ", labs
loaded = False
while not loaded:
try:
browser.get(labs)
loaded = True
except Exception as e:
print "could not load labs: ",e
loaded = False
print "retrying in a few moments..."
time.sleep(5)
print arrow.now().format('YYYY-MM-DD')
print browser.current_url
while labs!=browser.current_url:
browser.get(labs)
print "fetching link: ", labs
print "current  link: ", browser.current_url
time.sleep(3)
print "cdt labs opened"
try:
element = WebDriverWait(browser, 15).until(EC.element_to_be_clickable((By.XPATH,"//span[@class='ng-scope']")))
finally:
try:
time.sleep(3)
print element
print "clicking on export"
element.click()
try:
c = WebDriverWait(browser, 15).until(EC.element_to_be_clickable((By.XPATH,"//a[@class='labs-item-select ng-binding']")))
print c
print "waited for export, waited for export all"
finally:
try:
c.click()
except:
browser.find_elements(By.XPATH,"//a[@class='labs-item-select ng-binding']")[0].click()
print "wait for export all, not successful:", browser.find_elements(By.XPATH,"//a[@class='labs-item-select ng-binding']")
#c = browser.find_element(By.XPATH,"//a[@class='labs-item-select ng-binding']");
#print c
#print "clicking on export all data"
#c.click()
#time.sleep(dowload_timeout)
except:
print "clicking on export"
time.sleep(3)
browser.find_elements(By.XPATH,"//span[@class='ng-scope']")[0].click()
print "wait for export not successful: ", browser.find_elements(By.XPATH,"//span[@class='ng-scope']")
print "clicking on export all data"
try:
c = WebDriverWait(browser, 15).until(EC.element_to_be_clickable((By.XPATH,"//a[@class='labs-item-select ng-binding']")))
print c, "wait for export all successful"
finally:
try:
c.click()
except:
print "wait for export all unsuccessful", browser.find_elements(By.XPATH,"//a[@class='labs-item-select ng-binding']")
browser.find_elements(By.XPATH,"//a[@class='labs-item-select ng-binding']")[0].click()
time.sleep(dowload_timeout)
## checking/waiting for file to be downloaded
dirName = dpath
downloaded = False
download_d_checks = 6
while ((not downloaded) and (download_d_checks > 0)):
if not (os.path.exists(dirName) and os.path.isdir(dirName)):
print("Given Directory doesn't exists")
browser.quit()
sys.exit()
else:
if not os.listdir(dirName):
print("Directory is empty")
print "waiting a few moments..."
download_d_checks -= 1
time.sleep(5)
#browser.quit()
#sys.exit()
else:
downloaded = True
print("Directory is not empty")
file_ = os.listdir(dirName)[0]
print file_
print type(file_)
data = pd.read_csv(dirName+file_)
os.remove(dirName+file_)
print data.head(5)
## print browser.html
#print browser.execute_script("return document.documentElement.innerHTML;")
return browser, True
except Exception as e:
print "Error: ", e
raise e
#browser.quit()
return browser, False
browser,dpath = browser_init(page_load_timeout=15)
print "browser init done"
browser = cdt_init(browser)
print "cdt init done"
browser, download_status = cdt_labs(browser,dpath=dpath,labs="https://dashboard.cdt.com/#/reports/cdt-labs/{}/{}/0/6&3&7&15/r=desc/0",dowload_timeout=30)
if download_status:
print "file downloaded"
else:
print "file did not download"
browser.quit()
print arrow.now()

在服务器上运行时,我得到以下输出:

### 2018-05-27T13:57:41.028244+00:00 ###
could not open browser:  Message: 'chromedriver_Mac2' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home
/feed_log/pyenv/selenium/webdriver/chrome/chromedriver_3
/feed_log/pyenv/selenium/webdriver/chrome/downloads/
<selenium.webdriver.chrome.webdriver.WebDriver (session="4af93975ddc008717bd40286c930461a")>
browser init done
timing out for login
cdt init done
fetching link:  https://dashboard.cdt.com/#/reports/cdt-labs/2018-05-27/2018-05-27/0/6&3&7&15/r=desc/0
could not load labs:  Message: timeout
(Session info: headless chrome=64.0.3282.167)
(Driver info: chromedriver=2.36.540471 (9c759b81a907e70363c6312294d30b6ccccc2752),platform=Linux 3.13.0-87-generic x86_64)
retrying in a few moments...
2018-05-27
https://dashboard.cdt.com/#/reports/cdt-labs/2018-05-27/2018-05-27/0/6&3&7&15/r=desc/0
cdt labs opened
<selenium.webdriver.remote.webelement.WebElement (session="4af93975ddc008717bd40286c930461a", element="0.8633500363426354-1")>
clicking on export
<selenium.webdriver.remote.webelement.WebElement (session="4af93975ddc008717bd40286c930461a", element="0.8633500363426354-2")>
waited for export, waited for export all
Directory is empty
waiting a few moments...
Directory is empty
waiting a few moments...
Directory is empty
waiting a few moments...
Directory is empty
waiting a few moments...
Directory is empty
waiting a few moments...
Directory is empty
waiting a few moments...
file downloaded
2018-05-27T13:59:14.039865+00:00

python版本:Python 2.7.6

我最后收到消息作为文件下载 - 但不要混淆,它没有下载,这只是我保留的一条消息,以显示代码完全执行。 无论我添加多少超时,下载目录都保持为空。 PS:我不确定这是否重要,但我已经屏蔽了网址/客户端的名称。

可能是什么问题?

我有类似的问题,我解决了它,给下载目录写了权限。

download_default_directory = '/path/to/download_dir'
os.chmod(download_default_directory, 0777) # 0666 must be enought

相关内容

  • 没有找到相关文章

最新更新