抓取并将抓取的信息存储到csv文件中



我如何将抓取的信息放入csv文件中,然后关闭选项卡并将其写入新的文件中,并循环使用,直到论坛中的所有页面都被抓取。我仍在学习更多关于网络抓取的知识,我完全困在了这个问题上——需要抓取的div的class名称是"post-content";但当我测试时,它没有显示正确的信息

"""Scrape a forum index page and dump every thread's post text to one CSV.

Collects all "viewthread" links from the index, opens each thread in a new
browser tab, and appends each post's text as one CSV row.
"""
import csv

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common import window
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()
options.add_experimental_option("detach", True)
options.add_argument("start-maximized")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
# BUG FIX: WebDriverWait must be built AFTER the driver exists.  The original
# created it first, passing a bogus `import driver as driver` module object.
wait = WebDriverWait(driver, 100)

driver.get("https://navalcommand.enjin.com/forum/viewforum/2989694/m/11178354/page/1")
elems = driver.find_elements(By.XPATH, "//table[@class='structure small-cells']//a[@href]")

# Collect unique thread links ("viewthread" URLs only), preserving order.
links = []
for ele in elems:
    href = ele.get_attribute("href")
    if "viewthread" in href:
        links.append(href)
links = list(dict.fromkeys(links))
print(links)

# BUG FIX: the Windows path lost its backslashes in the original; the context
# manager also guarantees the CSV is flushed and closed even if scraping raises.
with open(r"C:\Users\jammi\OneDrive\Desktop\Navcomtest.csv", "w",
          encoding="UTF8", newline="") as f:
    csv_writer = csv.writer(f)
    # Open every thread link in a new tab and write its posts.
    for link in links:
        driver.switch_to.new_window(window.WindowTypes.TAB)
        driver.get(link)
        # BUG FIX: write each post's text.  The original wrote the whole list
        # of WebElement objects, producing their reprs instead of content.
        for post in driver.find_elements(By.CLASS_NAME, "post-content"):
            csv_writer.writerow([post.text])

这里有一个可能的解决方案:

"""Scrape every thread on every page of a forum, saving each thread's posts
to its own CSV file (named ``{thread_id}_navalcommand.csv``)."""
import csv

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.ChromeOptions()
# Disable chromedriver log messages in cmd.
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 10)

BASE_URL = "https://navalcommand.enjin.com/forum/viewforum/2989694/m/11178354/page/"
driver.get(BASE_URL + "1")

# Total page count, parsed from text like "Page 1 of 7" -> 7.
num_pages = int(driver.find_element(By.CSS_SELECTOR, "span.text.rightmost").text.split(" ")[1])

# BUG FIX: the original scraped first and navigated last with
# range(2, num_pages), which never scraped the final pages.  Navigating to
# each page explicitly before scraping covers pages 1..num_pages exactly once.
for page in range(1, num_pages + 1):
    driver.get(f"{BASE_URL}{page}")
    # Find all thread links on the current page.
    threads = wait.until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, "a.thread-view.thread-subject")))
    thread_links = [a.get_attribute("href") for a in threads]
    # Open each thread and dump all of its posts.
    for link in thread_links:
        driver.get(link)
        thread_content = driver.find_elements(By.CSS_SELECTOR, "div.post-content")
        # Thread id parsed from a URL shaped like ".../d/<id>-<slug>".
        thread_id = driver.current_url.split("d/")[1].split("-")[0]
        # BUG FIX: open the file once per thread (not once per post), and use
        # a real newline — the original's lineterminator='n' lost a backslash.
        with open(f"{thread_id}_navalcommand.csv", mode="a", encoding="utf-8") as f:
            writer = csv.writer(f, lineterminator="\n")
            for post in thread_content:
                # Fall back to the first image URL when a post has no text.
                post_content = post.text or post.find_element(By.TAG_NAME, "img").get_attribute("src")
                writer.writerow([post_content])
driver.quit()

输出是csv文件列表:

32694465_navalcommand.csv
33053469_navalcommand.csv
33079839_navalcommand.csv

每个文件都是一个单独的线程

您可以再次使用xpath进行post-content搜索。

# Equivalent XPath locator for the post bodies (div elements whose class is "post-content").
content = driver.find_elements(By.XPATH, "//div[@class='post-content']")

您的URL在通过Selenium访问时被阻止。因此,我使用了另一个URL并修改了您的代码,只需从下面的代码中获取逻辑,并根据您的需要修改定位器:

此代码将从主页中获取所有URL,在新选项卡中迭代并打开每个链接,打印内容并将内容保存到".csv"文件,关闭选项卡,然后移动到下一个链接。

# Grab every thread URL from the index page, open each in a new tab, print its
# posts and append them to one CSV file, then close the tab and move on.
driver.get("https://ubuntuforums.org/forumdisplay.php?f=326")           # Change this URL to your URL
elems = driver.find_elements(By.XPATH, "//*[@class='threadtitle']//a[@href]")           # Change this XPath as per your website
print("Length: ", len(elems))

# Collect the thread links ("showthread" URLs), de-duplicated in order.
links = []
for elem in elems:
    href = elem.get_attribute("href")
    if "showthread" in href:              # Change 'showthread' to the original - 'viewthread'
        links.append(href)
links = list(dict.fromkeys(links))
print("Links: ", links)
print("")

# BUG FIX: the Windows path lost its backslashes in the original; the context
# manager also guarantees the CSV file is closed/flushed when the loop ends.
with open(r"C:\Users\<user name>\Downloads\test.csv", "w",
          encoding="UTF8", newline="") as f:                  # modify this path
    csvWriter = csv.writer(f)
    # Open every link in a new tab, scrape its posts, then close the tab.
    for link in links:
        driver.switch_to.new_window(window.WindowTypes.TAB)
        driver.get(link)
        time.sleep(3)
        print("Contents of '", driver.title, "' page")
        print("----")
        # write the scraped information to a csv file
        posts = driver.find_elements(By.CSS_SELECTOR, ".postcontent.restore")               # Change '.postcontent.restore' to the original - '.post-content', but use CSS_SELECTOR
        for post in posts:
            print("Content: ", post.text)
            csvWriter.writerow([post.text])
            time.sleep(1)
        print("=============End of the page=================")
        print("")
        time.sleep(1)
        driver.close()
        # Return focus to the original (index) tab before the next link.
        driver.switch_to.window(driver.window_handles[0])


最新更新