(Django)代码在本地运行良好,但在生产中的第二次迭代后似乎停止了



我正在使用Selenium(Firefox(抓取网站上的数据。我创建了一个带有一些按钮的UI,比如";开始刮擦";以及";停止刮擦"当有权访问UI的人点击其中一个按钮时,它会调用Django Rest Framework API来启动或停止抓取。

在当地,一切都很好,似乎没有问题。然而,在生产过程中,刮擦总是只到达要收集的第二个项目。这是代码:

# api/views.py
r = redis.Redis(host='localhost', port=6379, db=0, decode_responses='utf-8')
@api_view(['POST'])
@permission_classes((IsAuthenticated,))
@ensure_csrf_cookie
def scrape_website(request):
if request.method == 'POST':
session_key = request.session.session_key
r.set('scrape-stop-%s' % (session_key), 'no')
r.expire('scrape-stop-%s' % (session_key), 60000)
data = request.POST
location = data.get('location')
company_type = data.get('company_type')
limit = data.get('limit')
scrape(session_key, location, company_type)
return Response({'msg': 'scrape successfully started'})
# scrape.py
def scrape(session_key=None, location='united_states', agency_type='digital_marketing_agencies', limit=200):
options = Options()
options.headless = True
profile = webdriver.FirefoxProfile()
profile.set_preference("browser.download.dir", os.path.join(base_dir, 'reports'))
profile.set_preference("browser.download.folderList", 2)
profile.set_preference("browser.helperApps.alwaysAsk.force", False);
profile.set_preference("browser.download.manager.showAlertOnComplete", False)
profile.set_preference("browser.download.manager.showWhenStarting", False)
profile.set_preference('browser.helperApps.neverAsk.saveToDisk','application/zip,application/octet-stream,application/x-zip-compressed,multipart/x-zip,application/x-rar-compressed, application/octet-stream,application/msword,application/vnd.ms-word.document.macroEnabled.12,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/rtf,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.ms-excel,application/vnd.ms-word.document.macroEnabled.12,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/xls,application/msword,text/csv,application/vnd.ms-excel.sheet.binary.macroEnabled.12,text/plain,text/csv/xls/xlsb,application/csv,application/download,application/vnd.openxmlformats-officedocument.presentationml.presentation,application/octet-stream')
driver = webdriver.Firefox(firefox_profile=profile, options=options)
print('Working...')
driver.get(links[location][agency_type])
def get_boxes(location, agency_type, limit, index=0):
print('index: %s' % (index))

boxes = driver.find_elements_by_xpath("//*[contains(@class, 'sc-AykKC') and contains(@class, 'sc-AykKD') and contains(@class,'hfxDgE')]")
while index <= (len(boxes) - 1):
box = boxes[index]
index += 1
link_element = box.find_element_by_xpath(".//a[contains(@href,'profile')]") # period for relative xpath searching
url = link_element.get_attribute('href')
if url not in to_crawl:
to_crawl.append(url)
print('Got box for %s.' % (url))
if index == (len(boxes) - 1):
try:
print('getting next page')
next_page_button = driver.find_element_by_xpath("//a[contains(@href,'page')]")
next_page_button.click()
get_boxes(location, agency_type, limit, index)
except Exception as e:
print(e)
for url_to_crawl in to_crawl:
parse_info(url_to_crawl)
def parse_info(url):
if r.get('scrape-stop-%s' % (session_key)) == 'yes':
return  

# these try/excepts are to determine
# if the xpath exists
# let me know if you find an alternative
# to this ugly method of doing this
driver.get(url)
print('trying url %s' % (url))
annual_revenue = None
num_employees = None
try:
if driver.find_element_by_xpath(
'/html/body/div[1]/div/div[7]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[2]/div[4]/div[1]').text == "Annual Revenue":
annual_revenue = driver.find_element_by_xpath(
'/html/body/div[1]/div/div[7]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[2]/div[4]/div[2]').text
except Exception as e:
print('annual_revenue', e)
try:
if driver.find_element_by_xpath(
'/html/body/div[1]/div/div[7]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]').text == "Number of employees":
num_employees = driver.find_element_by_xpath(
'/html/body/div[1]/div/div[7]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[2]').text
except Exception as e:
print('num_employees', e)
name = driver.find_element_by_xpath('/html/body/div[1]/div/div[4]/div/div[2]/div').text
url = driver.current_url
address_text_list = driver.find_element_by_xpath('/html/body/div[1]/div/div[5]/div/div[3]/div[1]').text.split('n')
if len(address_text_list) == 2:
address = "No Address"
city_state = address_text_list[0]
phone_number = address_text_list[1]
elif len(address_text_list) == 3:
address = address_text_list[0]
city_state = address_text_list[1]
phone_number = address_text_list[2]
location = '%s, %s' % (address, city_state)
try:
uc = Company.objects.create(
annual_revenue=annual_revenue,
num_employees=num_employees,
phone_number=phone_number,
name=name,
url=url,
location=location,
company_type=agency_type,
)
print('Saved %s' % (name))
except Exception as e:
print(e)

get_boxes(location, agency_type, limit)

有人能帮忙吗?提前感谢

我怎么知道它停止了:在生产中,我检查了journalctl -xe,刮到的只是第二家公司。然而,get_boxes函数一直贯穿始终。

gunicorn超时。set--timeout=600,一切正常

最新更新