我用 Python 和 Selenium 写了一个爬虫(代码见下文,欢迎测试并留下评论或建议!)。它在我的 Mac(本地)上运行良好,但按进度估算需要大约 6 天才能提取完整的数据。因此,我决定加入多进程(multiprocessing)来缩短时间。改动后在我的 Mac 上仍然可以正常工作,但是当我尝试在 Windows 虚拟机(Azure D8s_v3)上运行时,出现了以下错误:
DevTools listening on ws://127.0.0.1:56800/devtools/browser/de9e5088-9659-4604-b43f-8ea1fae02a66 [11728:11308:0805/085310.771:ERROR:device_event_log_impl.cc(214)] [08:53:10.782] Bluetooth: bluetooth_adapter_winrt.cc:1073 Getting Default Adapter failed.
你在 Windows 上运行时也会遇到这个错误吗?提前谢谢!
# Jonathan Augustin
# BELOW IS THE LINK WE WOULD LIKE YOU TO SCRAPE AS A TEST OF YOUR ABILITY:
# Dixie State University : https://registration.dixie.edu/transfer-guide/
# Please write a python script to extract the “To” and “From” transfer information from the highlighted link.
# The output should be in .JSON format. We would also like you to send the python script as well.
# We want ALL of the transfer information “TO” Dixie State University, “FROM” every other institution in every state.
import requests
from bs4 import BeautifulSoup
import json
import time
from itertools import chain
import logging
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import pprint
import time
import progressbar
import threading
import multiprocessing
# Template for one transfer-equivalency record written to the JSON output.
# Every field is a string; the "extra" fields hold the additional courses of a
# multi-course equivalency.
jsonClass = {
    # Sending institution and its primary course.
    "from_school": "",
    "from_course_department": "",
    "from_course_code": "",
    "from_course_name": "",
    "from_course_credit_hours": "",
    # Additional sending-side courses.
    "from_extra_department": "",
    "from_extra_code": "",
    "from_extra_name": "",
    "from_extra_credit_hours": "",
    # Receiving institution (always Dixie State) and its primary course.
    "to_school": "Dixie State University",
    "to_course_department": "",
    "to_course_code": "",
    "to_course_name": "",
    "to_course_credit_hours": "",
    # Additional receiving-side courses.
    "to_extra_department": "",
    "to_extra_code": "",
    "to_extra_name": "",
    "to_extra_credit_hours": "",
}
# Names of the 50 US states; a school is scraped only when the state part of
# its address appears in this list.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
    'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]
class searchPage(object):
    """Selenium-driven scraper for the Dixie State transfer-equivalency widget.

    Each instance owns one Chrome session. ``updateJson`` processes only the
    schools whose list index is congruent to ``number`` modulo the worker
    count, so several instances can split the school list between them.
    """

    # chromedriver location used when no explicit path is given.
    DEFAULT_DRIVER_PATH = "/Users/jonathanaugustin/Desktop/chromedriver"
    WIDGET_URL = 'https://widgets.collegetransfer.net/EquivWidget?institution=2734&name=Dixie%20State%20University&theme=/Content/Themes/Selene/jquery-ui-1.8.17.custom.css&direction=receiver&zip=84770-3876'
    OUTPUT_FILE = "dixie.json"
    # Chrome flags applied when ``headless=True``.
    HEADLESS_FLAGS = ('--no-sandbox', '--headless', '--disable-gpu')

    def __init__(self, number, driver_path=None, headless=False):
        """Start Chrome and load the equivalency widget.

        Args:
            number: Index of this worker; expected to lie in
                ``range(numthreads)`` for the value later passed to
                ``updateJson``.
            driver_path: Path to the chromedriver executable. Defaults to
                ``DEFAULT_DRIVER_PATH``.
            headless: When true, start Chrome with ``HEADLESS_FLAGS``.
        """
        options = Options()
        if headless:
            for flag in self.HEADLESS_FLAGS:
                options.add_argument(flag)
        self.number = number
        self.driver = webdriver.Chrome(driver_path or self.DEFAULT_DRIVER_PATH, options=options)
        self.driver.get(self.WIDGET_URL)

    @staticmethod
    def _state_of(address):
        """Return the second ``', '``-separated part of ``address``, or ``""``."""
        parts = address.split(', ')
        return parts[1] if len(parts) > 1 else ""

    @staticmethod
    def _split_course_id(text):
        """Split a course id such as ``"MATH 1010"`` into ``(department, code)``.

        Missing parts come back as empty strings instead of raising
        ``IndexError``; tokens after the second are ignored.
        """
        tokens = text.split()
        department = tokens[0] if tokens else ""
        code = tokens[1] if len(tokens) > 1 else ""
        return department, code

    @staticmethod
    def _read_credits(element):
        """Return the credit value shown under ``element``, or ``None``.

        Best effort: a missing credits line, a line with fewer than two spans,
        or any WebDriver error simply means "no credits available".
        """
        from selenium.common.exceptions import WebDriverException
        try:
            line = element.find_element_by_class_name('courseCreditsLine')
            spans = line.find_elements_by_tag_name('span')
            if spans[0].text == "Credits:":
                return spans[1].text
        except (WebDriverException, IndexError):
            return None
        return None

    def _fill_courses(self, record, prefix, container):
        """Copy department/code/name of the courses in ``container`` into ``record``.

        ``prefix`` is ``"from"`` or ``"to"``. The first course fills the
        ``*_course_*`` keys; any further courses are stored as stringified
        lists under the ``*_extra_*`` keys.
        """
        courses = container.find_elements_by_class_name('course')
        department, code = self._split_course_id(
            courses[0].find_element_by_class_name('courseId').text)
        record[prefix + "_course_department"] = department
        record[prefix + "_course_code"] = code
        record[prefix + "_course_name"] = container.find_element_by_class_name('courseTitle').text

        extras = courses[1:]
        if not extras:
            record[prefix + "_extra_department"] = ""
            record[prefix + "_extra_code"] = ""
            record[prefix + "_extra_name"] = ""
            return
        extra_ids = [
            self._split_course_id(extra.find_element_by_class_name('courseId').text)
            for extra in extras
        ]
        record[prefix + "_extra_department"] = str([dept for dept, _ in extra_ids])
        record[prefix + "_extra_code"] = str([number for _, number in extra_ids])
        record[prefix + "_extra_name"] = str(
            [extra.find_element_by_class_name('courseTitle').text for extra in extras])

    def _fill_credits(self, record, prefix, container):
        """Copy credit hours found in a detail-view ``container`` into ``record``.

        The primary value is the first credits line anywhere in ``container``;
        extra values come from every ``courseDetailContainer`` after the first.
        Fields with no credits available keep the value already in ``record``.
        """
        details = container.find_elements_by_class_name('courseDetailContainer')
        primary = self._read_credits(container)
        if primary is not None:
            record[prefix + "_course_credit_hours"] = primary
        if len(details) > 1:
            found = [self._read_credits(detail) for detail in details[1:]]
            record[prefix + "_extra_credit_hours"] = str(
                [value for value in found if value is not None])
        else:
            record[prefix + "_extra_credit_hours"] = ""

    def _scrape_school(self, school):
        """Open one school, append a record per equivalency, then go back."""
        from_school = school.get_attribute("data-sendername")
        # First click: open the school's equivalency list.
        school.click()
        time.sleep(2)
        equiv_list = self.driver.find_element_by_id('equivcontent')
        for equivalency in equiv_list.find_elements_by_class_name('selectableContainer'):
            # Start from a fresh copy of the template so that a field which
            # cannot be read stays blank instead of inheriting the previous
            # record's value.
            record = dict(jsonClass)
            record["from_school"] = from_school
            self._fill_courses(
                record, "from", equivalency.find_element_by_class_name('equivSourceContainer'))
            self._fill_courses(
                record, "to", equivalency.find_element_by_class_name('equivTargetContainer'))

            # Second click: open the detail view, which shows credit hours.
            equivalency.click()
            time.sleep(2)
            containers = self.driver.find_elements_by_class_name('courseListContainer')
            self._fill_credits(record, "from", containers[0])
            self._fill_credits(record, "to", containers[1])

            # One write call per record (object plus separator) so records
            # appended by other workers are less likely to land in between.
            with open(self.OUTPUT_FILE, "a") as out:
                out.write(json.dumps(record, indent=4) + ",")

            # Leave the detail view before handling the next equivalency.
            self.driver.find_element_by_id('detail').find_element_by_class_name('ui-corner-top').click()
            time.sleep(2)
        # Leave the equivalency list to return to the school list.
        self.driver.find_element_by_id('equivs').find_element_by_class_name('ui-state-default').click()
        time.sleep(2)

    def getSchools(self):
        """Scroll the school list to its end and return every school entry.

        The list is scrolled to the bottom repeatedly until the last 20
        characters of its text stop changing (presumably more entries load as
        it scrolls -- confirm against the widget).

        Returns:
            The ``selectableContainer`` elements inside ``#schoolsbyname``.
        """
        previous_tail = 'initial'
        current_tail = ''
        while previous_tail != current_tail:
            previous_tail = current_tail
            schools = self.driver.find_element_by_id('schoolsbyname')
            self.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', schools)
            time.sleep(0.4)
            current_tail = schools.text[-20:]
        return schools.find_elements_by_class_name('selectableContainer')

    def updateJson(self, numthreads):
        """Scrape this worker's share of schools and append records to the output file.

        Args:
            numthreads: Total number of workers. A school at list index ``x``
                is handled by the worker for which
                ``x % numthreads == self.number``.

        Each record is appended to ``dixie.json`` as an indented JSON object
        followed by a comma; schools whose address state is not in ``states``
        are skipped.
        """
        schools = self.getSchools()
        with progressbar.ProgressBar(max_value=len(schools)) as bar:
            for index, school in enumerate(schools):
                bar.update(index)
                if index % numthreads != self.number:
                    continue
                # Only schools located in a US state are scraped.
                address = school.find_element_by_class_name('address').text
                if self._state_of(address) not in states:
                    continue
                self._scrape_school(school)

    def tearDown(self):
        """End the browser session and stop the chromedriver process."""
        # quit() rather than close(): close() only closes the current window
        # and leaves the driver process running.
        self.driver.quit()
def _scrape_partition(number, numthreads):
    """Worker entry point: scrape the schools assigned to worker ``number``.

    The browser session is created inside the worker process. Under the
    "spawn" start method (the only one on Windows) the process target and its
    arguments are pickled into the child, so a live WebDriver held by the
    parent cannot be handed over; only the two integers cross the boundary.
    """
    page = searchPage(number)
    try:
        page.updateJson(numthreads)
    finally:
        # Always release the browser, even if scraping fails part-way.
        page.tearDown()


if __name__ == "__main__":
    num_workers = 7

    # Start the output as an open JSON array; workers append "record," items.
    with open("dixie.json", "w") as my_file:
        my_file.write("[")

    workers = [
        multiprocessing.Process(target=_scrape_partition, args=(index, num_workers))
        for index in range(num_workers)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # Close the array. Every record is followed by a comma, so drop the final
    # one first -- "[{...},]" is not valid JSON.
    with open("dixie.json", "rb+") as my_file:
        my_file.seek(0, 2)  # whence=2: relative to the end of the file
        end = my_file.tell()
        if end > 0:
            my_file.seek(end - 1)
            if my_file.read(1) == b",":
                my_file.seek(end - 1)
                my_file.truncate()
        my_file.seek(0, 2)
        my_file.write(b"]")
我通过添加以下 Chrome 启动选项之一解决了这个问题:
> options.add_argument('--no-sandbox')
> options.add_argument('--headless')
> options.add_argument('--disable-gpu')