If an element's type is list, assign the raw element instead of wrapping it in another list, since it is already a list, but this doesn't seem to work



I have an if statement that checks whether an element is a "list". If it is, I don't want to store it in the dictionary wrapped inside another list; I want to store the raw element, since it is already a list. (This gets saved into a dictionary.)

For every other element, I assign it wrapped as [element] (again saved into the dictionary).

However, the if statement never seems to execute.

Please see the relevant part of the code below:

print(type(contentAggregator[8]))
print(type(contentAggregator[9]))
print(type(contentAggregator[10]))
for i, k in enumerate(contentDict):
    if type(contentAggregator[i]) == isinstance(contentAggregator[i],list):
        contentDict[k] = contentAggregator[i]
    else:
        contentDict[k] = [contentAggregator[i]]
print(contentAggregator)
print(contentDict)

The code above performs the check, and I have verified that the type of the element is "list", but for some reason my if statement never seems to execute. Really puzzling...

# This Python file uses the following encoding: utf-8
import pymysql
from sqlalchemy import create_engine
pymysql.install_as_MySQLdb()
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium import webdriver as wd
from Stamprally import StamprallyInfo
import re
import pandas as pd
import numpy as np
import urllib.request
import math
import time
import sys
import os
import MySQLdb
programStart = time.time()

prefectureNameList = ["北海道", "青森県", "岩手県", "宮城県", "秋田県", "山形県", "福島県", "茨城県", "栃木県", "群馬県", "埼玉県", "千葉県", "東京都", "神奈川県", "新潟県", "富山県", "石川県", "福井県", "山梨県", "長野県", "岐阜県", "静岡県",
"愛知県", "三重県", "滋賀県", "京都府", "大阪府", "兵庫県", "奈良県", "和歌山県", "鳥取県", "島根県", "岡山県", "広島県", "山口県", "徳島県", "香川県", "愛媛県", "高知県", "福岡県", "佐賀県", "長崎県", "熊本県", "大分県", "宮崎県", "鹿児島県", "沖縄県"]

data = []
timeStampData = []
contentAggregator = []
timeStampData.append("프로그램 시작")
timeStampData.append(programStart)
main_url = 'https://stamprally.org/'
programEnd = time.time()
timeStampData.append(programEnd - programStart)
timeStamp = pd.DataFrame(np.array([timeStampData]), columns=[
    '설명', 'TimeStamp', '소요기간'])
timeStampData.clear()
timeStampData.append("셀레니엄 드라이버 로딩")
seleniumStart = time.time()
timeStampData.append(seleniumStart)
driver = wd.Chrome(executable_path='chromedriver.exe')
driver.get(main_url)
seleniumEnd = time.time()
timeStampData.append(seleniumEnd - seleniumStart)
rowAddTimeStampSelenium = pd.Series(timeStampData, index=timeStamp.columns)
timeStamp = timeStamp.append(rowAddTimeStampSelenium, ignore_index=True)
timeStampData.clear()
prefectureValueStorage = [x.get_attribute('value') for x in driver.find_elements_by_xpath(
    "//select[@name='search_cat1']/option[@class='level-1']")]
prefectureNameIterator = 0
for prefectureValue in prefectureValueStorage:
    prefectureStart = time.time()
    timeStampData.append(prefectureNameList[prefectureNameIterator])
    timeStampData.append(prefectureStart)
    driver.get(
        f"https://stamprally.org/?search_keywords&search_keywords_operator=and&search_cat1={prefectureValue}&search_cat2=0")
    imageDownloadCounter = 1
    totalList = driver.find_element_by_css_selector(
        'div.page_navi2.clearfix>p').text
    totalListNum = totalList.split("件中")
    if int(totalListNum[0]) % 10 != 0:
        pageLoopCount = math.ceil((int(totalListNum[0])/10))
    else:
        pageLoopCount = int(totalListNum[0])/10
    currentpage = 0
    while currentpage < pageLoopCount:
        currentpage += 1
        driver.get(
            f"https://stamprally.org/?search_keywords&search_keywords_operator=and&search_cat1={prefectureValue}&search_cat2=0&paged={currentpage}")
        urlList = []
        currentUrlCounter = 0
        listURLContainer = driver.find_elements_by_css_selector(
            '#post_list2 > li > a')
        for url in listURLContainer:
            urlList.append(url.get_attribute('href'))
        for listURL in listURLContainer:
            contentAggregator = []
            contentAggregator.append(int(totalListNum[0]))
            contentAggregator.append(
                prefectureNameList[prefectureNameIterator])
            contentAggregator.append(
                urlList[currentUrlCounter])
            driver.get(urlList[currentUrlCounter])
            currentUrlCounter += 1
            locationTag = [x.get_attribute('title') for x in driver.find_elements_by_xpath(
                "//*[@id='post_meta_top']/li[1]/a[@class='cat-category']")]
            contentAggregator.append(locationTag)
            eventTag = [x.get_attribute('title') for x in driver.find_elements_by_xpath(
                "//*[@id='post_meta_top']/li[2]/a[@class='cat-category2']")]
            contentAggregator.append(eventTag)
            availablePeriod = (driver.find_element_by_css_selector(
                'div#post_date')).text.split("( ")
            availablePeriodFormatted = availablePeriod[0].replace("開催期間:", "")
            availableStartDate = availablePeriod[0].split(" ~ ")
            endDate = availableStartDate[1]
            availableStartDateFormatted = availableStartDate[0].replace(
                "開催期間:", "")
            lastUpdatedDate = driver.find_element_by_css_selector(
                'time.entry-date.updated').text
            contentAggregator.append(availablePeriodFormatted)
            contentAggregator.append(availableStartDateFormatted)
            contentAggregator.append(endDate)
            contentAggregator.append(lastUpdatedDate[6:])
            mainImageUrl = driver.find_element_by_css_selector(
                'img.attachment-post-thumbnail.size-post-thumbnail.wp-post-image').get_attribute('src')
            contentAggregator.append(mainImageUrl)
            postTitle1 = driver.find_element_by_css_selector(
                'h2#post_title').text.replace("開催終了", "")
            postTitle = postTitle1.replace("ただいま開催中", "")
            removeSpecialChars = postTitle.translate(
                {ord(c): " " for c in "!@#$%^&*()[]{};:,./<>?|`~-=_+"})
            postTitle = removeSpecialChars
            contentAggregator.append(postTitle)
            eventValidity = driver.find_element_by_xpath(
                "//*[@id='post_title']/span").text
            contentAggregator.append(eventValidity)
            urllib.request.urlretrieve(mainImageUrl, (str(
                prefectureNameList[prefectureNameIterator])+postTitle+str(imageDownloadCounter) + ".png"))
            imageDownloadCounter += 1
            prefectureNameIterator += 1
            innerWebSiteButtonURL = driver.find_element_by_css_selector(
                'div.post_content.clearfix > div >a').get_attribute('href')
            contentAggregator.append(innerWebSiteButtonURL)
            mainText = driver.find_elements_by_css_selector(
                'div.post_content.clearfix > p')
            mainContentText = []
            for mainContentDetail in mainText:
                mainContentText.append(mainContentDetail.text)
            mainContextTextCount = len(mainContentText)-1
            contentAggregator.append(mainContentText[:mainContextTextCount])
            contentReorder = [1, 0, 10, 11, 5, 6, 7, 8, 13, 3, 4, 9, 12, 2]
            contentAggregator = [contentAggregator[i] for i in contentReorder]
            data = data.append(contentAggregator)
            df = pd.DataFrame(data, columns=["Prefecture", "Total List Number", "Title", "Event Validity", "Available Period", "Available StartDate",
                                             "End Date", "Last Updated", "mainText", "Location Tag", "Event Tag", "Main Image URL", "innerWebSiteURL", "ListLink"])
            contentDict = {
                "Prefecture": "",
                "Total List Number": "",
                "Title": "",
                "Event Validity": "",
                "Available Period": "",
                "Available StartDate": "",
                "End Date": "",
                "Last Updated": "",
                "mainText": "",
                "Location Tag": "",
                "Event Tag": "",
                "Main Image URL": "",
                "innerWebSiteURL": "",
                "ListLink": ""
            }
            print(type(contentAggregator[8]))
            print(type(contentAggregator[9]))
            print(type(contentAggregator[10]))
            for i, k in enumerate(contentDict):
                if type(contentAggregator[i]) == isinstance(contentAggregator[i],list):
                    contentDict[k] = contentAggregator[i]
                else:
                    contentDict[k] = [contentAggregator[i]]
            print(contentAggregator)
            print(contentDict)

            engine = create_engine("mysql+mysqldb://root:abcdefgH1@localhost/stamprallydb", encoding='utf-8')
            df2 = pd.DataFrame(data=contentDict)
            # df2=df.transpose()
            conn = engine.connect()
            df2.to_sql(name='stamprallydb_crawl_result',
                       con=engine, if_exists='append', index=True)
    else:
        prefectureEnd = time.time()
        timeStampData.append(prefectureEnd-prefectureStart)
        rowAddTimeStampPrefecture = pd.Series(
            timeStampData, index=timeStamp.columns)
        timeStamp = timeStamp.append(
            rowAddTimeStampPrefecture, ignore_index=True)
        timeStampData.clear()

excelFileStart = time.time()
xlwriter = pd.ExcelWriter('StampRally_Crawler.xlsx')
df.to_excel(xlwriter, sheet_name="Stamprally.org Crawl Result")
excelFileEnd = time.time()
timeStampData.append("엑셀 파일 저장")
timeStampData.append(excelFileStart)
timeStampData.append(excelFileEnd-excelFileStart)
rowAddTimeStampPrefecture = pd.Series(timeStampData, index=timeStamp.columns)
timeStamp = timeStamp.append(rowAddTimeStampPrefecture, ignore_index=True)
timeStamp.to_excel(xlwriter, sheet_name="TimeStamp Result")
xlwriter.close()
driver.close()
driver.quit()
sys.exit()

type returns the type object of the argument passed to it. isinstance returns a boolean describing whether the first argument is an instance of the second. The two can never compare equal. Long story short, drop the type expression and evaluate only the isinstance expression:

if isinstance(contentAggregator[i], list):
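
To see why the original condition never fires: type gives back a type object such as <class 'list'>, while isinstance gives back True or False, and a type object never compares equal to a boolean. Below is a minimal sketch (the sample values are hypothetical, not taken from the scraper) followed by the dictionary-filling loop from the question rewritten with the fix:

value = ["Tokyo", "Shibuya"]                     # hypothetical list element
print(type(value))                               # <class 'list'>
print(isinstance(value, list))                   # True
print(type(value) == isinstance(value, list))    # <class 'list'> == True  -> False

value = "Tokyo"                                  # hypothetical non-list element
print(type(value) == isinstance(value, list))    # <class 'str'> == False -> False

# Corrected dictionary-filling loop:
for i, k in enumerate(contentDict):
    if isinstance(contentAggregator[i], list):
        # already a list, so store it as-is
        contentDict[k] = contentAggregator[i]
    else:
        # wrap scalar values so every dictionary entry is a list
        contentDict[k] = [contentAggregator[i]]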
