使用 Selenium 在 Python 中抓取需要点击“Load More”加载更多数据的表格的所有列



我正在尝试使用 selenium-webdriver 抓取 1853 行数据。在 Stack Overflow 上得到一些帮助后，代码可以生成 1080 行，但在前 400 行之后，结果会从 ..09 直接跳到 ..65，中间的行没有加载数据。输出示例如下：

1008 Cartesi  CTSI/DAI  $10,426  0.00%  -  Percentage  Recently
1009 WETH  WETH/LION  $10,178  0.00%  -  Percentage  Recently
1065 WETH  WETH/DFIO  $7,823  0.00%  -  Percentage  Recently
1066 YAMv2  YAMv2/WETH  $7,794  0.00%  -  Percentage  106 days ago
# Scrape the market-pair table of the CoinMarketCap "uniswap-v2" exchange page
# with Selenium: scroll the page, click "Load More" repeatedly, then read each
# column of the table and print the rows.
#
# NOTE(review): the pasted snippet had lost its indentation. The nesting below
# is a reconstruction: the cookie-banner click and the "Load More" click run
# once per outer pass, after the scroll loop. Confirm against the original.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get('https://coinmarketcap.com/exchanges/uniswap-v2/')

# The page height is read once, before any additional rows are loaded, so the
# scroll loop always covers this initial height.
total_height = int(driver.execute_script("return document.body.scrollHeight"))

for x in range(20):
    time.sleep(2)
    # Scroll down in 130px steps, pausing one second between steps.
    for i in range(1, total_height, 130):
        time.sleep(1)
        driver.execute_script("window.scrollTo(0, {});".format(i))
    if x == 0:
        # Close the cookie banner on the first pass only.
        driver.find_element(
            By.CSS_SELECTOR, 'div.cmc-cookie-policy-banner__close'
        ).click()
    time.sleep(2)
    driver.find_element(By.XPATH, '//button[text() = "Load More"]').click()

time.sleep(2)

# One list of elements per table column. The locator-based find_elements()
# form is used because the find_elements_by_* helpers were removed in
# Selenium 4.3; this form is also available in Selenium 3.
first_column = driver.find_elements(By.CSS_SELECTOR, 'td.cmc-table__cell.cmc-table__cell--sticky.cmc-table__cell--sortable.cmc-table__cell--left.cmc-table__cell--sort-by__rank > div')
second_column = driver.find_elements(By.CSS_SELECTOR, 'div.cwwgik-0.bCvAgC')
third_column = driver.find_elements(By.CSS_SELECTOR, 'div.hmd6df-0.kCRNNr')
fourth_column = driver.find_elements(By.CSS_SELECTOR, 'div.cmc-table__column-market-pair-volume-24h')
fifth_column = driver.find_elements(By.CSS_SELECTOR, 'div.cmc-table__column-market-pair-volume-percent')
sixth_column = driver.find_elements(By.CSS_SELECTOR, 'td.cmc-table__cell.cmc-table__cell--sortable.cmc-table__cell--right.cmc-table__cell--sort-by__quote-usd-effective-liquidity > div')
seventh_column = driver.find_elements(By.CSS_SELECTOR, 'td.cmc-table__cell.cmc-table__cell--sortable.cmc-table__cell--right.cmc-table__cell--sort-by__fee-type > div')
eighth_column = driver.find_elements(By.CSS_SELECTOR, 'div.ghkc60-0.fLaXDt')
print(len(second_column))

columns = (
    first_column,
    second_column,
    third_column,
    fourth_column,
    fifth_column,
    sixth_column,
    seventh_column,
    eighth_column,
)

# `it` starts at 1 and is incremented once per printed row, so the value
# printed at the end is the number of rows plus one.
it = 1
# Rows are indexed by the length of the second column; a shorter list in any
# other column raises IndexError here.
for i in range(len(second_column)):
    it += 1
    cells = [str(column[i].get_attribute("innerText")) for column in columns]
    # Single space after the rank, two spaces between the remaining cells.
    print(cells[0] + ' ' + '  '.join(cells[1:]))
print(it)

您可以使用 requests 直接请求页面所用的 API 端点来获取所有*数据。将 limit 参数改为最大值（如果所需条数超过 limit 允许的上限，请使用循环并修改 start 参数）。在本例中，可以用下面的代码获得所有结果：

import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint
# Fetch the market pairs for the "uniswap-v2" exchange slug from the web API
# in a single request (limit=5000, start=1) and pretty-print them.
headers = {'User-Agent': 'Mozilla/5.0'}
r = requests.get(
    'https://web-api.coinmarketcap.com/v1/exchange/market-pairs/latest?aux=num_market_pairs,category,fee_type,market_url,currency_name,currency_slug,effective_liquidity&convert=USD,BTC&limit=5000&market_status=active&slug=uniswap-v2&start=1',
    headers=headers,
    # requests applies no timeout by default; bound the wait so a stalled
    # connection cannot hang the script.
    timeout=30,
)
# Fail with a clear HTTP error instead of a KeyError/JSON error further down.
r.raise_for_status()
data = r.json()['data']['market_pairs']
print(len(data))
pprint(data)

您可以编写一个自定义函数，把每个交易对的所有条目展平，使每个交易对输出为一行。TODO：对列做一些格式化（例如把数字转换为百分比），删除返回的 JSON 中不需要的列，并根据需要对列进行重命名和重新排序。

格式化和重命名示例:

import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint
import pandas as pd
def get_row(item: dict) -> dict:
    """Flatten one market-pair record into a single-level dict.

    The nested values are expanded into hyphen-joined keys:

    * ``market_pair_base`` / ``market_pair_quote`` hold one level of nesting
      and yield keys such as ``market_pair_base-currency_name``.
    * ``quote`` holds two levels of nesting and yields keys such as
      ``quote-USD-price``.
    * Every other key is copied through unchanged.

    Args:
        item: One record; the three nested keys, when present, must map to
            dicts (``quote`` to a dict of dicts).

    Returns:
        A new flat dict; ``item`` itself is not modified.
    """
    row = {}
    for key, value in item.items():
        if key == 'quote':
            # Two levels deep: quote -> <group> -> <field>.
            for group, fields in value.items():
                for field, field_value in fields.items():
                    row[f"{key}-{group}-{field}"] = field_value
        elif key in ('market_pair_base', 'market_pair_quote'):
            # One level deep: <key> -> <field>.
            for field, field_value in value.items():
                row[f"{key}-{field}"] = field_value
        else:
            row[key] = value
    return row

# Fetch the market pairs, flatten each record with get_row(), and build a
# DataFrame with a subset of formatted, renamed columns.
headers = {'User-Agent': 'Mozilla/5.0'}
r = requests.get(
    'https://web-api.coinmarketcap.com/v1/exchange/market-pairs/latest?aux=num_market_pairs,category,fee_type,market_url,currency_name,currency_slug,effective_liquidity&convert=USD,BTC&limit=5000&market_status=active&slug=uniswap-v2&start=1',
    headers=headers,
    # requests applies no timeout by default; bound the wait so a stalled
    # connection cannot hang the script.
    timeout=30,
)
# Fail with a clear HTTP error instead of a KeyError/JSON error further down.
r.raise_for_status()
data = r.json()['data']['market_pairs']
print(len(data))

results = [get_row(item) for item in data]
df = pd.DataFrame(results)
# 1-based rank following the order in which the API returned the records.
df['Rank'] = range(1, len(data) + 1)

to_keep = [
    'Rank',
    'market_pair_base-currency_name',
    'market_pair',
    'quote-USD-price',
    'quote-USD-volume_24h',
    'quote-USD-effective_liquidity',
    'category',
    'fee_type',
    'quote-USD-last_updated',
]
df = df[to_keep]

# NOTE(review): the int64 cast and int(x) below raise if a record has a
# missing/NaN liquidity or volume value; confirm the API always supplies them.
df = df.astype({'quote-USD-effective_liquidity': 'int64'})
df['quote-USD-volume_24h'] = df['quote-USD-volume_24h'].apply(lambda x: f"${int(x):,}")
df['quote-USD-price'] = df['quote-USD-price'].apply(lambda x: f"${x:.2f}")

df = df.rename(
    columns={
        'market_pair_base-currency_name': 'Currency',
        'market_pair': 'Pair',
        'quote-USD-price': 'Price',
        'quote-USD-volume_24h': 'Volume (24h)',
        'quote-USD-effective_liquidity': 'Liquidity',
        'category': 'Category',
        'fee_type': 'Fee Type',
        'quote-USD-last_updated': 'Updated',
    }
)
df.head(5)
# df.to_csv('data.csv', encoding = 'utf-8-sig', index=False)

*我假设 Volume (%) 是根据返回的 JSON 中各列的值计算得出的指标。您可以打印 results[0] 自行查看。

结果中的dict示例:

{'uniswap_info_url': 'https://info.uniswap.org/pair/0x0d4a11d5eeaac28ec3f61d100daf4d40471f1852',
'dextool_url': 'https://www.dextools.io/app/uniswap/pair-explorer/0x0d4a11d5eeaac28ec3f61d100daf4d40471f1852',
'outlier_detected': 0,
'exclusions': None,
'market_pair_base-exchange_symbol': 'WETH',
'market_pair_base-currency_symbol': 'WETH',
'market_pair_base-currency_id': 2396,
'market_pair_base-currency_name': 'WETH',
'market_pair_base-currency_slug': 'weth',
'market_pair_base-currency_type': 'cryptocurrency',
'market_pair_quote-exchange_symbol': 'USDT',
'market_pair_quote-currency_symbol': 'USDT',
'market_pair_quote-currency_id': 825,
'market_pair_quote-currency_name': 'Tether',
'market_pair_quote-currency_slug': 'tether',
'market_pair_quote-currency_type': 'cryptocurrency',
'quote-exchange_reported-price': 2667.608286932708,
'quote-exchange_reported-volume_24h_base': 127187.96129018,
'quote-exchange_reported-volume_24h_quote': 339287659.535759,
'quote-exchange_reported-last_updated': '2021-05-27T03:48:07.000Z',
'quote-BTC-price': 0.0713253375594862,
'quote-BTC-volume_24h': 9071.724272524909,
'quote-BTC-effective_liquidity': 0.024501313774380872,
'quote-BTC-last_updated': '2021-05-27T03:55:02.000Z',
'quote-USD-price': 2672.102873016602,
'quote-USD-volume_24h': 339859316.77661276,
'quote-USD-effective_liquidity': 917.9070603711,
'quote-USD-last_updated': '2021-05-27T03:48:07.000Z',
'market_id': 53239,
'market_pair': 'WETH/USDT',
'category': 'spot',
'fee_type': 'percentage',
'market_url': 'https://app.uniswap.org/#/swap?inputCurrency=0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2&outputCurrency=0xdac17f958d2ee523a2206206994597c13d831ec7'}

相关内容

  • 没有找到相关文章