import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome import service
from selenium.webdriver.common.keys import Keys
import time
import wget
import os
import pandas as pd
import matplotlib.pyplot as plt
urls = [
'https://www.youtube.com/c/LofiGirl/videos',
'https://www.youtube.com/c/Miawaug/videos'
]
for url in urls:
PATH = 'C:webdriverschromedriver.exe.'
driver = webdriver.Chrome(PATH)
driver.get(url)
#driver.maximize_window()
driver.implicitly_wait(10)
for i in range(10):
driver.find_element(By.TAG_NAME, "Body").send_keys(Keys.END)
driver.implicitly_wait(20)
time.sleep(5)
judul_video = []
viewers = []
tanggal_posting = []
titles = driver.find_elements(By.XPATH, "//a[@id='video-title']")
views = driver.find_elements(By.XPATH, "//div[@id='metadata-line']/span[1]")
DatePosted = driver.find_elements(By.XPATH, "//div[@id='metadata-line']/span[2]")
for title in titles:
judul_video.append(title.text)
driver.implicitly_wait(5)
for view in views:
viewers.append(view.text)
driver.implicitly_wait(5)
for posted in DatePosted:
tanggal_posting.append(posted.text)
driver.implicitly_wait(5)
vid_item = {
"video_title" : judul_video,
"views" : viewers,
"date_posted" : tanggal_posting
}
df = pd.DataFrame(vid_item, columns=["video_title", "views", "date_posted"])
#df_new = df.transpose()
print(df)
filename = url.split('/')[-2]
df.to_csv(rf"C:Users.......YouTube_{filename}.csv", sep=",")
driver.quit()
这个代码运行良好,但在这个代码:
for posted in DatePosted:
tanggal_posting.append(posted.text)
driver.implicitly_wait(5)
当一些频道做直播时,比如lofi Girl,我有一个错误说"所有数组必须具有相同的长度";。显然,我未能创建if-else条件来用其他值填充流媒体频道,如Tanggal_posting.append("Live Stream"(或else,或者只是完全跳过从标题开始的提取数据。下面的代码试图跳过或填充其他值,但失败了:
for posted in DatePosted:
if len(posted.text) > 0:
tanggal_posting.append(posted.text)
driver.implicitly_wait(5)
else:
tanggal_posting.append("Live")
driver.implicitly_wait(5)
我如何才能跳过迭代,只为一个显示正在进行直播的视频?或者我如何用其他值来填充该值,例如"0";Live Stream";通过使用我之前提到的if-else条件?提前非常感谢。
就我个人而言,我会首先检查posted
是否适用于.text属性调用。
for posted in DatePosted:
_posted = posted.text.strip() if posted else None
tanggal_posting.append(_posted if _posted else "Live")
driver.implicitly_wait(5)
或者:
for posted in DatePosted:
_posted = posted.text.strip() if posted else None
if not _posted:
continue
tanggal_posting.append(_posted)
driver.implicitly_wait(5)
总体代码应根据您的目标而有所不同。尽管我认为_posted
对其中任何一个都有帮助。
与其为每个数据项收集3个单独的列表,我建议获得视频列表,然后提取每个项目并处理:
videos = driver.find_elements(By.XPATH, "//div[@id='items']/ytd-grid-video-renderer")
for video in videos:
if not video.find_elements(By.XPATH, ".//yt-icon"): # Check if no Streaming icon
title = video.find_element(By.XPATH, ".//a[@id='video-title']")
view = video.find_element(By.XPATH, ".//div[@id='metadata-line']/span[1]")
DatePosted = video.find_element(By.XPATH, ".//div[@id='metadata-line']/span[2]")
请注意,您只需要在脚本开头调用driver.implicitly_wait(<SECONDS>)
一次!