我有以下代码将一些搜索结果打印到控制台:
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Start Chrome through the chromedriver binary at the given path and open Google.
browser = webdriver.Chrome('/Users/Downloads/chromedriver')
browser.get('http://www.google.com')
# Find the element named 'q' (presumably the search box), type the query and
# submit it with the RETURN key.
search = browser.find_element_by_name('q')
search.send_keys("youtube")
search.send_keys(Keys.RETURN)
# NOTE(review): this prints the WebDriver object itself, not anything read from
# the page -- no result elements are located or read anywhere in this script.
print(browser)
# Pause for 10 seconds, then close the browser.
time.sleep(10)
browser.quit()
输出不正确。为什么?
我写了一个简单的类,你可以直接使用,只需修改 webdriver 的路径即可。它是为 PhantomJS 编写的(你可以在这里下载),但如果你想使用 Chrome(或任何其他 WebDriver),只需将 self.driver = webdriver.PhantomJS(self.path)
这一行替换为 self.driver = webdriver.Chrome(self.path)
即可。下面是代码示例:
import time
from urllib.parse import quote_plus
from selenium import webdriver
class Browser:
    """Small wrapper around a Selenium PhantomJS driver for Google searches.

    To use a different driver, replace ``webdriver.PhantomJS`` in ``start``.
    """

    def __init__(self, path, initiate=True, implicit_wait_time=10, explicit_wait_time=2):
        """Store the configuration and optionally start the driver.

        Args:
            path: Path of the driver executable handed to the webdriver.
            initiate: When true, ``start()`` is called immediately.
            implicit_wait_time: Seconds passed to ``driver.implicitly_wait``.
            explicit_wait_time: Default number of seconds to sleep after a
                page has been requested.
        """
        self.path = path
        # Background on the two kinds of wait:
        # http://www.aptuz.com/blog/selenium-implicit-vs-explicit-waits/
        self.implicit_wait_time = implicit_wait_time
        self.explicit_wait_time = explicit_wait_time
        # Defined up front so end() is safe even when start() was never called.
        self.driver = None
        if initiate:
            self.start()

    def start(self):
        """Create the PhantomJS driver and apply the implicit wait."""
        self.driver = webdriver.PhantomJS(self.path)
        self.driver.implicitly_wait(self.implicit_wait_time)

    def end(self):
        """Quit the driver if one is running; otherwise do nothing."""
        if self.driver is None:
            return
        # Drop the reference first so a second end() call is a no-op.
        driver, self.driver = self.driver, None
        driver.quit()

    def go_to_url(self, url, wait_time=None):
        """Load ``url``, log it, then sleep for ``wait_time`` seconds.

        Args:
            url: Address to open in the driver.
            wait_time: Seconds to sleep afterwards; ``None`` means use
                ``self.explicit_wait_time``.
        """
        if wait_time is None:
            wait_time = self.explicit_wait_time
        self.driver.get(url)
        print('[*] Fetching results from: {}'.format(url))
        time.sleep(wait_time)

    def get_search_url(self, query, page_num=0, per_page=10, lang='en'):
        """Build the Google search URL for one page of results.

        Args:
            query: Raw search text; it is URL-encoded with ``quote_plus``.
            page_num: Zero-based page index.
            per_page: Number of results requested per page.
            lang: Language code sent as the ``hl`` parameter.

        Returns:
            The search URL as a string.
        """
        query = quote_plus(query)
        # ``hl`` is Google's interface-language parameter.
        return 'https://www.google.hr/search?q={}&num={}&start={}&hl={}'.format(
            query, per_page, page_num * per_page, lang)

    def scrape(self):
        """Collect the result links from the page currently loaded.

        Returns:
            A list of ``{'url': ..., 'title': ...}`` dicts, one per link.
        """
        # Matches links inside h3 tags with class "r"; the XPath might change
        # in the future.
        links = self.driver.find_elements_by_xpath("//h3[@class='r']/a[@href]")
        return [{'url': link.get_attribute('href'), 'title': link.text}
                for link in links]

    def search(self, query, page_num=0, per_page=10, lang='en', wait_time=None):
        """Run a search and return the scraped results.

        Args:
            query: Raw search text.
            page_num: Zero-based page index.
            per_page: Number of results requested per page.
            lang: Language code for the search URL.
            wait_time: Seconds to sleep after loading; ``None`` means use
                ``self.explicit_wait_time`` (resolved in ``go_to_url``).

        Returns:
            The list of result dicts produced by ``scrape``.
        """
        url = self.get_search_url(query, page_num, per_page, lang)
        self.go_to_url(url, wait_time)
        return self.scrape()
# Example usage: run one search and print every result dict.
path = '<YOUR PATH TO PHANTOMJS>/phantomjs-2.1.1-windows/bin/phantomjs.exe'  # set your path to phantomjs
br = Browser(path)
try:
    results = br.search('site:facebook.com inurl:login')
    for r in results:
        print(r)
finally:
    # Always quit the driver, even when the search or printing raises.
    br.end()
在 Java 中,代码如下所示:
// Find every element matching the XPath, print how many were found,
// then print the text of each one.
By locator = By.xpath("//div[@class='sbqs_c']");
List<WebElement> matches = driver.findElements(locator);
System.out.println(matches.size());
for (WebElement match : matches) {
    System.out.println(match.getText());
}
我不是 Python 开发者,但代码可能类似于:
# Type a query into the element named 'q', then print the text of every
# element matched by the XPath below.
browser = webdriver.Chrome('/Users/Downloads/chromedriver')
browser.get('http://www.google.com')
search = browser.find_element_by_name('q')
search.send_keys("youtube")
# Query through `browser`, the only driver object defined in this snippet.
ids = browser.find_elements_by_xpath("//div[@class='sbqs_c']")
for ii in ids:
    print(ii.text)
来源:在 Python 中使用索引迭代列表
希望它能帮助你:)
# Parse the markup and print the href attribute of every <a> tag.
# NOTE(review): `html` is not defined in this snippet; it is assumed to hold
# the page markup -- confirm where it comes from.
# Naming the parser explicitly keeps the result independent of which parsers
# happen to be installed and avoids bs4's "no parser specified" warning.
soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a'):
    print(link.get('href'))
我用 BeautifulSoup 找到了自己问题的答案