所以我写了一些代码来获取大学课程的数据,以构建一个交互式调度程序。这是我获取数据的代码:
from selenium import webdriver
import os
import pwd
import shlex
import re
import time
usr = pwd.getpwuid(os.getuid()).pw_name
Path = ('/Users/%s/Downloads/chromedriver') %usr # Have chromedriver dowloaded
# Create a new instance of the Chrome driver
options = webdriver.ChromeOptions()
options.binary_location = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
options.add_argument('headless') # Headless so no window is opened
options.add_argument('window-size=1200x600')
driver = webdriver.Chrome(Path, chrome_options=options)
driver.get('https://web.stevens.edu/scheduler/core/2017F/2017F.xml') # Go to database
classes = {}
def Database(AllSelectedCourseInfo):
ClassDict = {}
for item in AllSelectedCourseInfo: # Go through list of class info
try:
thing = item.split("=") # Split string by = to get subject name and value
name = thing[0]
if any(char.isdigit() for char in thing[1]): # Get rid of annoying Z at the end of numbers
thing[1] = re.sub("[Z]","",thing[1])
value = thing[1]
if value: # If subject has a value, store it
ClassDict[str(name)] = str(value) # Store value in a dictionary with the subject as the key
except:
pass
classes[str(ClassDict["Section"])] = ClassDict # Add to dictionary
def makeDatabase(section):
if "Title" in driver.find_element_by_xpath("//*[text()='%s']"%section).find_element_by_xpath("..").text:
classSection = driver.find_elements_by_xpath("//*[text()='%s']"%section) # If class name given find class
for i in range(0, len(classSection)):
AllSelectedCourseInfo = shlex.split(classSection[i].find_element_by_xpath(".." + "/.."*4).text.replace("/>", "").replace(">", "")) # sort into a list grouping string in quotes and getting rid of unnecessary symbols
Database(AllSelectedCourseInfo)
else:
classSection = driver.find_element_by_xpath("//*[text()='%s']"%section) # If class section give, find class
AllSelectedCourseInfo = shlex.split(classSection.find_element_by_xpath(".." + "/.."*3).text.replace("/>", "").replace(">", "")) # sort into a list grouping string in quotes and getting rid of unnecessary symbols
Database(AllSelectedCourseInfo)
def printDic():
for key in classes:
print "n-------------%s------------" %key
for classkey in classes[key]:
print "%s : %s" %(classkey, classes[key][classkey])
start = time.time()
makeDatabase("Differential Calculus")
makeDatabase("MA 124B")
printDic()
end = time.time()
print end - start
driver.quit()
我从一个类和一个类部分提取数据大约需要 20 秒,如果我要使其实用,它至少需要 7 个类,而创建字典就需要一分钟多的时间。有谁知道一种方法可以让它运行得更快?
我试图将 lxml 和请求集成到我的代码中,但它只是没有我想要的东西。经过几天尝试使用 lxml 来完成此操作无济于事,我决定尝试使用 urllib 的 beautifulsoup4。这比我希望的要好,
from bs4 import BeautifulSoup
from HTMLParser import HTMLParser
import urllib
import shlex
import re
import time
h = HTMLParser()
page = urllib.urlopen('https://web.stevens.edu/scheduler/core/2017F/2017F.xml').read() # Get to database
soup = BeautifulSoup(page)
RawClassData = soup.contents[10].contents[0].contents[0].contents
classes = {}
backupClasses = {}
def makeDatabase():
for i in range(0, len(RawClassData)): # Parse through each class
try:
AllSelectedCourseInfo = shlex.split(h.unescape(str(RawClassData[i]).replace(">", " "))) # sort into a list grouping string in quotes and getting rid of unnecessary symbols
ClassDict = {}
for item in AllSelectedCourseInfo: # Go through list of class info
try:
thing = item.split("=") # Split string by = to get subject name and value
name = thing[0]
if any(char.isdigit() for char in thing[1]): # Get rid of annoying Z at the end of numbers
thing[1] = re.sub("[Z]","",thing[1])
value = thing[1]
if value: # If subject has a value, store it
ClassDict[str(name)] = str(value) # Store value in a dictionary with the subject as the key
except:
pass
classes[str(ClassDict["section"])] = ClassDict
except:
pass
def printDic():
with open("Classes", "w") as f:
for key in classes:
f.write("n-------------%s------------" %key)
for classkey in classes[key]:
f.write( "n%s : %s" %(classkey, classes[key][classkey]))
f.write("n")
def printSection(selection):
print "n-------------%s------------" %selection
for classkey in classes[selection]:
print "%s : %s" %(classkey, classes[selection][classkey])
def printClass(selection):
try:
for key in classes:
if classes[key]["title"] == selection:
print "n-------------%s------------" %key
for classkey in classes[key]:
print "%s : %s" %(classkey, classes[key][classkey])
finally:
print "n-------------%s------------" %selection
for classkey in classes[selection]:
print "%s : %s" %(classkey, classes[selection][classkey])
start = time.time()
makeDatabase()
end = time.time()
printClass("Circuits and Systems")
printClass("Differential Equations")
printClass("Writing & Communications Collqm")
printClass("Mechanics of Solids")
printClass("Electricity & Magnetism")
printClass("Engineering Design III")
printClass("Freshman Quiz")
printDic()
print end - start
这个新代码创建一个包含所有类的库,然后在 2 秒内打印出所需的类。硒代码花了 89 秒来构建所需类的库并打印出来,我会说这是一个轻微的改进......感谢完美第五的建议!