Parsing data without classes



I have the following Python code:

from bs4 import BeautifulSoup
import requests
import pandas as pd

data0 = []
data1 = []
response = requests.get(
    "https://www.comicshoplocator.com/StoreLocatorPremier?query=75077&showCsls=true"
)
soup = BeautifulSoup(response.text, "html.parser")

# Collect the shop names.
for tag in soup.find_all('div', class_="LocationName"):
    title = tag.text
    data0.append({
        'title': title
    })

# Collect the websites by following each "Shop Profile" link.
for button in soup.find_all('div', class_="LocationDetails"):
    for childdiv in button.find_all('div', class_="LocationShopProfile"):
        for zb in childdiv.find_all('a'):
            if zb.get_text() == 'Shop Profile':
                website = zb.get('href')
                forsite = requests.get('https://www.comicshoplocator.com/' + website)
                soup = BeautifulSoup(forsite.text, "html.parser")
                for tag in soup.find_all('div', class_="StoreWeb"):
                    site = tag.text.replace('Web: http://', '')
                    data1.append({
                        'site': site
                    })

df = pd.DataFrame(columns=['Name', 'Website'])
df[df.columns[0]] = pd.DataFrame(data0)
df[df.columns[1]] = pd.DataFrame(data1)

My output is:

Name                         Website
0       TWENTY ELEVEN COMICS      WWW.TWENTYELEVENCOMICS.COM
1                READ COMICS         www.boomerangcomics.com
2           BOOMERANG COMICS  www.facebook.com/morefuncomics
3  MORE FUN COMICS AND GAMES   www.madnesscomicsandgames.com
4     MADNESS COMICS & GAMES                             NaN
5  SANCTUARY BOOKS AND GAMES                             NaN

The correct output should be:

Name                         Website
0       TWENTY ELEVEN COMICS      WWW.TWENTYELEVENCOMICS.COM
1                READ COMICS                             NaN
2           BOOMERANG COMICS         www.boomerangcomics.com
3  MORE FUN COMICS AND GAMES  www.facebook.com/morefuncomics
4     MADNESS COMICS & GAMES   www.madnesscomicsandgames.com
5  SANCTUARY BOOKS AND GAMES                             NaN

Some shops may not have a "LocationShopProfile" or "StoreWeb" class, which is why the second column is in the wrong order.
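Here is a minimal illustration of what I believe is happening (hypothetical data copied from the rows above): the names and websites are collected in two independent lists, so when one shop has no website the shorter column simply shifts up.

import pandas as pd

# Hypothetical example: READ COMICS has no website, but the two lists are
# built independently, so the remaining sites move up one row.
names = ["TWENTY ELEVEN COMICS", "READ COMICS", "BOOMERANG COMICS"]
sites = ["WWW.TWENTYELEVENCOMICS.COM", "www.boomerangcomics.com"]

df = pd.concat(
    [pd.Series(names, name="Name"), pd.Series(sites, name="Website")],
    axis=1,
)
print(df)
#                    Name                     Website
# 0  TWENTY ELEVEN COMICS  WWW.TWENTYELEVENCOMICS.COM
# 1           READ COMICS     www.boomerangcomics.com
# 2      BOOMERANG COMICS                         NaN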

How can I fix this?

Thanks

Try:

import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

url = "https://www.comicshoplocator.com/StoreLocatorPremier?query=75077&showCsls=true"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

all_data = []
# Iterate shop by shop, so each name stays paired with its own website.
for shop in soup.select(".CslsLocationItem"):
    name = shop.select_one(".LocationName").text
    u = shop.select_one(".LocationShopProfile a")
    if u:
        s = BeautifulSoup(
            requests.get(
                "https://www.comicshoplocator.com" + u["href"]
            ).content,
            "html.parser",
        )
        u = s.select_one(".StoreWeb a")
    # Shops without a profile link or without a StoreWeb entry get NaN.
    all_data.append((name, u["href"] if u else np.nan))

df = pd.DataFrame(all_data, columns=["Name", "Website"])
print(df.to_markdown(index=False))

Prints:

| Name                      | Website                                |
|:--------------------------|:---------------------------------------|
| TWENTY ELEVEN COMICS      | http://WWW.TWENTYELEVENCOMICS.COM      |
| READ COMICS               | nan                                    |
| BOOMERANG COMICS          | http://www.boomerangcomics.com         |
| MORE FUN COMICS AND GAMES | http://www.facebook.com/morefuncomics  |
| MADNESS COMICS & GAMES    | http://www.madnesscomicsandgames.com   |
| SANCTUARY BOOKS AND GAMES | nan                                    |
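The hrefs scraped from the profile pages keep the scheme, so the Website column shows an http:// prefix that the desired output above does not have. An optional clean-up step (a sketch, assuming every non-missing href starts with http:// or https://) strips it:

# Optional: strip the scheme so the values match the desired output above.
# NaN values pass through the .str accessor unchanged.
df["Website"] = df["Website"].str.replace(r"^https?://", "", regex=True)
print(df.to_markdown(index=False))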
