Problem scraping Instagram with Selenium when trying to append URLs to a URL list



Guys, I think I have a tricky problem. I'm trying to build a bot that collects all the photo/video URLs of an Instagram account, appends them to a list, and finally saves them to a file. However, while watching it run, I noticed that the URL list always holds 51 URLs: each time new URLs come in while the program runs, the 51 URLs on the list are swapped out for 51 new ones and the previous ones are dropped, instead of the new URLs being appended to the existing ones so the list keeps growing. Why is this happening? I need your knowledge, guys :(

The bot's code is below:

# Here is the run.py from where I'm running the program

import os
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import autoit
from selenium.webdriver.common.keys import Keys
import requests
import coockies
import PopUpsClose
import login
import link
import url_extraxction

def main():
    # Makes a mobile emulator to start Instagram like a smartphone
    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
        "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(chrome_options=chrome_options)
    browser.get('https://www.instagram.com/accounts/login/')
    coockies.close_coockies(browser)
    login.Insta_login(browser)
    PopUpsClose.pop_up(browser)
    ######################################
    # Here it takes the url from the file
    url = link.page_link(browser)
    browser.get(url)
    sleep(5)
    # Scrolling down the page and getting the URLs
    url_extraxction.extract(browser, url)


main()

Here is the login function

from time import sleep

def Insta_login(browser):
    login_file = open(r'C:\Users\bilakos\Desktop\PYTHON_PROJECTS\InstaAutoPhotoUpload\login.txt', 'r')
    username = login_file.readline()
    while username != '':
        password = login_file.readline()
        username_ = username.rstrip("\n")
        password = password.rstrip("\n")
        username = login_file.readline()
        sleep(2)
        browser.find_element_by_xpath("""//*[@id="loginForm"]/div[1]/div[3]/div/label/input""").send_keys(username_)
        browser.find_element_by_xpath("""//*[@id="loginForm"]/div[1]/div[4]/div/label/input""").send_keys(password)
        sleep(2)
        browser.find_element_by_xpath("""/html/body/div[1]/section/main/div[1]/div/div/div/form/div[1]/div[6]/button/div""").click()
        sleep(10)
    login_file.close()

Here is the coockies function

def close_coockies(browser):
    coockies_accept = browser.find_element_by_xpath("""/html/body/div[2]/div/div/div/div[2]/button[1]""")
    coockies_accept.click()

Here is the PopUpsClose function

from time import sleep

def pop_up(browser):
    # Here it finds the button to close the 1st pop-up
    not_now_button = browser.find_element_by_xpath("""/html/body/div[1]/section/main/div/div/div/button""")
    not_now_button.click()
    sleep(10)
    # Here it finds the button to close the 2nd pop-up
    not_now_button2 = browser.find_element_by_xpath("""/html/body/div[4]/div/div/div/div[3]/button[2]""")
    not_now_button2.click()
    sleep(2)

And last is the url_extraction function, where I have the problem

from time import sleep
import requests
import os

def extract(browser, url):
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 OPR/73.0.3856.329"}
    requests.get(url, headers=header)
    # SCROLL DOWN
    print("This process maybe it will take like 5 minutes.\n", "Don't close the program......")
    last_height = 0
    proceed = ''
    while True:
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        sleep(1)
        # GET THE URLS
        elements = browser.find_elements_by_xpath('//a[@href]')
        links = []
        for elem in elements:
            urls = elem.get_attribute('href')
            if urls not in links and 'p' in urls.split('/'):
                links.append(urls)
        print(links)
        sleep(2)
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    if False:
        proceed = False
    else:
        proceed = True
    sleep(10)
    # Create a folder with the name of the profile
    if proceed == True:
        name = browser.find_element_by_class_name("_7UhW9.fKFbl.yUEEX.KV-D4.fDxYl")
        text = name.text
        print("Wait to create a folder to pass the extracted links.\nPlease don't close the program.")
        print('\n' * 2)
        sleep(5)
        path = "C:\\Users\\bilakos\\Desktop\\PYTHON_PROJECTS\\InstaAutoPhotoUpload\\" + text
        sleep(2)
        try:
            os.mkdir(path)
            link_extraction = open('C:\\Users\\bilakos\\Desktop\\PYTHON_PROJECTS\\InstaAutoPhotoUpload\\' + text
                                   + '\\extracted_links.txt', 'w')
            sleep(2)
            print("The extracted_links.txt file is created.")
            print('\n' * 2)
            for i in links:
                link_extraction.write(i + '\n')
            link_extraction.close()
            sleep(2)
            print('The links transferred successfully to the file.')
        except FileExistsError:
            print('The file already exists.')
            link_extraction = open('C:\\Users\\bilakos\\Desktop\\PYTHON_PROJECTS\\InstaAutoPhotoUpload\\' + text
                                   + '\\extracted_links.txt', 'w')
            sleep(2)
            print("The extracted_links.txt file is created.")
            print('\n' * 2)
            for i in links:
                link_extraction.write(i + '\n')
            link_extraction.close()
            sleep(2)
            print('The links transferred successfully to the file.')

Inside the url_extraction function there is a #GET THE URLS comment, and right after it is where the problem happens.

Inside the while loop you redefine the list on every scroll. So in effect you only save the last scroll's results to the file.

def extract(browser, url):
    ...
    while True:
        # scroll down
        ...
        links = []                      # <--- (1) ---
        for elem in elements:
            urls = elem.get_attribute('href')
            if urls not in links and 'p' in urls.split('/'):
                links.append(urls)      # <--- (2) ---
        print(links)
        ...
        # check if at end and if yes then break out of loop

At (1) you define a new list. At (2) you append to it. But on the next iteration of the while loop you define a new list again at (1), and the items collected so far are lost.
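You can see the same effect with a tiny standalone snippet (no Selenium involved); the data here is made up purely for illustration:

# Purely illustrative: the list is re-created on every pass, so only the
# items from the last pass survive the loop.
batches = [['a', 'b'], ['c', 'd'], ['e']]   # stand-in for the per-scroll results

for batch in batches:
    links = []                  # (1) a brand-new list on every iteration
    for item in batch:
        if item not in links:
            links.append(item)  # (2) appended items are thrown away on the next pass

print(links)  # prints ['e'] -- everything before the last batch is gone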

To keep the results, the list has to be defined outside the while loop.

def extract(browser, url):
    ...
    links = []                          # <--- (1) ---
    while True:
        # scroll down
        ...
        for elem in elements:
            urls = elem.get_attribute('href')
            if urls not in links and 'p' in urls.split('/'):
                links.append(urls)      # <--- (2) ---
        print(links)
        ...
        # check if at end and if yes then break out of loop
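An optional refinement, separate from the fix itself: `urls not in links` scans the whole list on every check, which gets slow once the list holds thousands of URLs. A small, self-contained sketch of the same filtering logic using an auxiliary set for the membership test; the `hrefs` sample data and the `seen` helper are made up for illustration:

# Hypothetical sample of hrefs as they might come back from
# browser.find_elements_by_xpath('//a[@href]') on a profile page.
hrefs = [
    'https://www.instagram.com/p/AAA111/',
    'https://www.instagram.com/explore/',
    'https://www.instagram.com/p/BBB222/',
    'https://www.instagram.com/p/AAA111/',   # duplicate from a later scroll
]

links = []    # ordered result, written to the file at the end
seen = set()  # O(1) membership checks instead of scanning the list

for href in hrefs:
    if href not in seen and 'p' in href.split('/'):
        seen.add(href)
        links.append(href)

print(links)  # ['https://www.instagram.com/p/AAA111/', 'https://www.instagram.com/p/BBB222/']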
