我下面的代码获取每个健身房的街道地址,但是健身房开放时间的输出间距存在错误。知道我哪里出错了吗?
import urlparse
from bs4 import BeautifulSoup
from bs4 import Tag
import requests
import time
import csv

# Fetch the sitemap and collect the links to the individual gym pages.
sitemap = 'https://www.planetfitness.com/sitemap'
sitemap_content = requests.get(sitemap).content
soup = BeautifulSoup(sitemap_content, 'html.parser')
atags = soup.select('td[class~=club-title] > a[href^="/gyms"]')
links = [atag.get('href') for atag in atags]

with open('gyms.csv', 'w') as gf:
    gymwriter = csv.writer(gf)
    for link in links:
        # Follow the link to this gym's detail page.
        gymurl = urlparse.urljoin(sitemap, link)
        sitemap_content = requests.get(gymurl).content
        soup = BeautifulSoup(sitemap_content, 'html.parser')
        gymrow = [ gymurl ]
        # Street-address components of this gym.
        address_line1 = soup.select('p[class~=address] > span[class~=address-line1]')
        gymrow.append(address_line1[0].text)
        locality = soup.select('p[class~=address] > span[class~=locality]')
        gymrow.append(locality[0].text)
        administrative_area = soup.select('p[class~=address] > span[class~=administrative-area]')
        gymrow.append(administrative_area[0].text)
        postal_code = soup.select('p[class~=address] > span[class~=postal-code]')
        gymrow.append(postal_code[0].text)
        country = soup.select('p[class~=address] > span[class~=country]')
        gymrow.append(country[0].text)
        # Club hours: the text of the first Tag sibling after the
        # "Club Hours" heading.
        # NOTE(review): sibling.text keeps the newlines produced by the
        # <br> tags inside that element — this is the spacing problem
        # visible in the CSV output; collapse them before writing the row.
        strongs = soup.select('div > strong')
        for strong in strongs:
            if strong.text == 'Club Hours':
                for sibling in strong.next_siblings:
                    if isinstance(sibling, Tag):
                        hours = sibling.text
                        gymrow.append(hours)
                        break
        print(gymrow)
        gymwriter.writerow(gymrow)
        time.sleep(3)
感谢您的帮助!
您要选择的是类为 club-title 的 td 元素下的 a 元素,并提取其 href 属性。
from bs4 import BeautifulSoup
from bs4 import Tag
import requests
import urllib.parse
import time
import csv

sitemap = 'https://www.planetfitness.com/sitemap'
res = requests.get(sitemap).content
soup = BeautifulSoup(res, 'html.parser')

# The rows in the table of gyms are formatted like so:
#   <tr>
#     <td class="club-title"><a href="/gyms/albertville-al"><strong>Albertville, AL</strong> <p>5850 US Hwy 431</p></a></td>
#     <td class="club-join"><div class="button"><a href="/gyms/albertville-al/offers" title="Join Albertville, AL">Join Now</a></div></td>
#   </tr>
# This will find all the links to all the gyms.
atags = soup.select('td[class~=club-title] > a[href^="/gyms"]')
links = [atag.get('href') for atag in atags]

# newline='' prevents the csv module from emitting blank lines on Windows.
with open('gyms.csv', 'w', newline='') as gf:
    gymwriter = csv.writer(gf)
    for link in links:
        # Follow the link to this gym.
        gymurl = urllib.parse.urljoin(sitemap, link)
        res = requests.get(gymurl).content
        soup = BeautifulSoup(res, 'html.parser')
        gymrow = [ gymurl ]
        # The address of this gym.
        address_line1 = soup.select('p[class~=address] > span[class~=address-line1]')
        gymrow.append(address_line1[0].text)
        locality = soup.select('p[class~=address] > span[class~=locality]')
        gymrow.append(locality[0].text)
        administrative_area = soup.select('p[class~=address] > span[class~=administrative-area]')
        gymrow.append(administrative_area[0].text)
        postal_code = soup.select('p[class~=address] > span[class~=postal-code]')
        gymrow.append(postal_code[0].text)
        country = soup.select('p[class~=address] > span[class~=country]')
        gymrow.append(country[0].text)
        # The hours of this gym.
        strongs = soup.select('div > strong')
        for strong in strongs:
            if strong.text == 'Club Hours':
                for sibling in strong.next_siblings:
                    if isinstance(sibling, Tag):
                        hours = sibling.text
                        # .text already strips the <br> tags themselves but
                        # keeps the newlines they render as.  Replace the
                        # newline character '\n' — NOT the letter 'n', which
                        # would mangle words like "Monday" — to collapse the
                        # hours onto a single line.
                        gymrow.append(hours.replace('\n', ', '))
                        break
        gymwriter.writerow(gymrow)
        time.sleep(3)
当我运行这个时,我得到:
$ more gyms.csv
https://www.planetfitness.com/gyms/albertville-al,5850 US Hwy 431,Albertville,AL,35950,United States,"Monday-Friday 6am-9pm, Sat
urday-Sunday 7am-7pm"
https://www.planetfitness.com/gyms/alexander-city-al,987 Market Place,Alexander City,AL,35010,United States,Convenient hours whe
n we reopen
https://www.planetfitness.com/gyms/bessemer-al,528 W Town Plaza,Bessemer,AL,35020,United States,Convenient hours when we reopen
https://www.planetfitness.com/gyms/birmingham-crestline-al,4500 Montevallo Rd,Birmingham,AL,35210,United States,Convenient hours
when we reopen
.
.
.
要调试它,您应该首先打印出 atags 的值。您原来搜索的是类为 clubs-list 的 a 标签,但这样的标签并不存在:a 标签本身没有类,而它的父元素 td 才具有类 club-title。
你可以试试这样的事情。
res = requests.get("https://www.planetfitness.com/sitemap").content
soup = BeautifulSoup(res, 'html.parser')
# The links live inside the <td class="club-title"> cells, not on a
# class of the <a> tag itself.
tds = soup.find_all('td', {'class': 'club-title'})
links = [td.find('a')['href'] for td in tds]
keywords = ['gyms']
for link in links:
    # Keep only the links that point at gym detail pages.
    if any(keyword in link for keyword in keywords):
        print(link)
这将获取该页面上的每个链接和地址。看起来如果你想找到每个俱乐部的更多信息,你必须反复浏览并加载每个页面。
from bs4 import BeautifulSoup
import requests

res = requests.get("https://www.planetfitness.com/sitemap").content
soup = BeautifulSoup(res, 'html.parser')
tds = soup.find_all('td', {'class': 'club-title'})
# Pair each gym's relative URL with its street address.
# (The original had a stray ')' here — `for atag in atags)]` — which is a
# SyntaxError.)
links = [(td.find('a')['href'], td.find('p').text) for td in tds]
# A plain loop, not a list comprehension: printing is a side effect and
# the comprehension's result list would be discarded.
for link in links:
    print(link)