如何将不在表中的信息放入网络抓取的数据帧中


import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import requests
from time import sleep
from random import randint
import re
towns = pd.DataFrame()
town_names = [f"Abbeville-Alabama",
f"Abernant-Alabama",
f"Alpine-Utah",
f"Dixon-Montana",
f"Adak-Alaska",]
for town_name in town_names:
page = requests.get(f"https://www.city-data.com/city/{town_name}.html").text
doc = BeautifulSoup(page, "html.parser")
print(town_name)
sex_population = str(doc.find(id="population-by-sex"))
(males, females) = [float(x) for x in re.findall(r"(?<=()[0-9]+.[0-9]+(?=%))", sex_population)]
print(males, females)
# poverty_level = str(doc.find(id="poverty-level"))
# broke = float(re.findall("(<?</b> )[0-9]*.[0-9]*", poverty_level))
# print(broke)
# religion_population = str(doc.find(id="religion"))
# atheist = float(re.findall("(?<=None</td><td>)[0-9,]*(?=</td><td>)", religion_population)[0].replace(",", ""))
# print(atheist)
total_population = str(doc.find(id="city-population"))
residents = float(re.findall("(?<=</b> )[0-9]*", total_population)[0].replace(",", ""))
print(residents)
religion_population = doc.find(id="religion").find_all('tr')
data = []
for row in religion_population:
columns = row.find_all('td')
if columns:
religion = columns[0].get_text(strip=True)
number = columns[1].get_text(strip=True).replace(",", "").replace("-","0")
print(f'religion: {religion} | number: {number}')
data.append([religion, int(number)])
df = pd.DataFrame(data, columns=['religion', 'number'])
df['percentage'] = (df['number'] / df['number'].sum()) * 100
atheist=df[df.religion == "None"].iloc[0]["percentage"]
evangelicals = df[df.religion == "Evangelical Protestant"].iloc[0]["percentage"]
print(atheist)
print(evangelicals)
education_population = doc.find(id="education-info").find_all('b')
data = []
for row in education_population:
columns = row.find_all('b')
if columns:
education = columns[0].get_text(strip=True)
ed_number = columns[1].get_text(strip=True).replace(",", "").replace("-", "0")
print(f'education: {education} | number: {ed_number}')
data.append([education, int(ed_number)])
df = pd.DataFrame(data, columns=['education', 'number'])
df['percentage'] = (df['number'] / df['number'].sum()) * 100
phds = df[df.education == "Graduate or professional degree"].iloc[0]["percentage"]
highschoolgrads = df[df.education == "High school or higher"].iloc[0]["percentage"]
print(phds)
print(highschoolgrads)
print("n")

如何将教育信息输入数据帧?我试图将这些价值观组织成教育水平和百分比。

还有,你知道为什么当我试图把贫困水平作为破产的浮动指标时,它说不能,因为这是一个列表吗?

在这一点上,我只是打字,以便stackoverflow允许我发布,因为它认为我没有足够的细节。所以……如果有人对此感兴趣的话,我正在做一个数据挖掘/机器学习项目,它将从人权委员会的市政平等指数中获取分数和有关得分城市的信息,并尝试学习如何估计尚未得分城市的分数。这是我的第一个机器学习项目。我担心我第一次选的东西太大了,但我真的已经下定决心了

刮刀出现问题。

在教育中,在URL中,值似乎在list元素li中,而title/labelli内的b中。在scraper中,你只是在收集b标签,所以你错过了required numeric value。你可以改变它使它工作。谢谢

最新更新