我试图检索某个页面的 pageviews(页面浏览量)信息,但该页面检索失败,而其他页面都能成功。我得到如下错误:
File "<unknown>", line 1
article =='L'amica_geniale_ (serie_di_romanzi )'
^
SyntaxError: invalid syntax
但是文本中没有空格。该页面为:https://it.wikipedia.org/wiki/L%27amica_geniale_(serie_di_romanzi)
代码为:
# First day of the pageview range, formatted YYYY/MM/DD.
start_date = "2005/01/01"

# Browser-style User-Agent sent with every API request.
headers = {"User-Agent": "Mozilla/5.0"}
def wikimedia_request(page_name, start_date, end_date=None):
    """Fetch daily pageview counts for *page_name* from the Wikimedia REST API.

    Parameters
    ----------
    page_name : str
        Article title as it appears in the wiki URL.
    start_date : str
        First day of the range, formatted "YYYY/MM/DD".
    end_date : datetime.datetime, optional
        Last day of the range; defaults to the current moment.

    Returns
    -------
    pandas.DataFrame
        Indexed by timestamp, with 'article' and 'views' columns.
    """
    import datetime  # local import so the snippet is self-contained

    sdate = start_date.replace("/", "")  # API expects YYYYMMDD
    # Bug fix: the original referenced an undefined `edate` (end_date was
    # accepted but never converted); derive it here.
    if end_date is None:
        end_date = datetime.datetime.now()
    edate = end_date.strftime("%Y%m%d")

    r = requests.get(
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/all-agents/{}/daily/{}/{}".format(page_name, sdate, edate),
        headers=headers,
    )
    r.raise_for_status()  # raises exception when not a 2xx response

    result = r.json()
    df = pd.DataFrame(result["items"])
    # Timestamps arrive as YYYYMMDDHH; drop the trailing hour digits.
    df["timestamp"] = [i[:-2] for i in df.timestamp]
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df.set_index("timestamp", inplace=True)
    return df[["article", "views"]]
# Bug fix: `wikimedia_request(name="Random", start_date)` was a SyntaxError
# (positional argument after a keyword argument), and the function's first
# parameter is called `page_name`, not `name`.
df = wikimedia_request("Random", start_date)

names = ["L'amica geniale"]
dfs = pd.concat([wikimedia_request(x, start_date) for x in names])
除此页面外,代码都能正常工作。我猜问题可能出在标题中的撇号上。
注意您使用的 URL:该页面在 'it.wikipedia.org' 上,而您的代码请求的是 'en.wikipedia.org'。使用正确的 URL 时,该页面也能正常检索到。问题与撇号无关。你可以这样同时处理这两种情况:
import requests
import pandas as pd
import datetime
# Start of the requested pageview range (YYYY/MM/DD).
start_date = "2005/01/01"

# Identify as a regular browser when calling the Wikimedia API.
headers = {"User-Agent": "Mozilla/5.0"}
def wikimedia_request(page_name, start_date, end_date=None):
    """Fetch daily pageviews for *page_name*, trying en.wikipedia.org first
    and falling back to it.wikipedia.org when the article is not found there.

    Parameters
    ----------
    page_name : str
        Article title (apostrophes are fine; requests URL-encodes them).
    start_date : str
        First day of the range, formatted "YYYY/MM/DD".
    end_date : datetime.datetime, optional
        Last day of the range; defaults to the current moment.

    Returns
    -------
    pandas.DataFrame
        Indexed by timestamp, with 'article' and 'views' columns.

    Raises
    ------
    requests.HTTPError
        If the article resolves on neither wiki.
    """
    sdate = start_date.replace("/", "")  # API expects YYYYMMDD
    if end_date is None:  # `is None`, not `== None`
        end_date = datetime.datetime.now()
    edate = end_date.strftime("%Y%m%d")

    base = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{}.wikipedia.org/all-access/all-agents/{}/daily/{}/{}"
    try:
        # Most article names resolve on the English wiki; try it first.
        r = requests.get(base.format("en", page_name, sdate, edate), headers=headers)
        r.raise_for_status()  # raises exception when not a 2xx response
    except requests.HTTPError:
        # Not on en.wikipedia (e.g. a 404): retry against the Italian wiki.
        # Narrowed from the original bare `except:`, which would also have
        # swallowed KeyboardInterrupt, SystemExit and connection errors.
        r = requests.get(base.format("it", page_name, sdate, edate), headers=headers)
        r.raise_for_status()  # raises exception when not a 2xx response

    result = r.json()
    df = pd.DataFrame(result["items"])
    # Timestamps arrive as YYYYMMDDHH; drop the trailing hour digits.
    df["timestamp"] = [i[:-2] for i in df.timestamp]
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df.set_index("timestamp", inplace=True)
    return df[["article", "views"]]
# df = wikimedia_request(name="Random", start_date)

# Titles to query; the first one only exists on the Italian wiki.
names = ["L'amica geniale_(serie_di_romanzi)", "L'amica geniale"]

# Fetch one frame per title and stack them into a single result.
frames = [wikimedia_request(title, start_date) for title in names]
dfs = pd.concat(frames)
输出:
print(dfs)
article views
timestamp
2018-11-21 L'amica_geniale_(serie_di_romanzi) 499
2018-11-22 L'amica_geniale_(serie_di_romanzi) 909
2018-11-23 L'amica_geniale_(serie_di_romanzi) 739
2018-11-24 L'amica_geniale_(serie_di_romanzi) 696
2018-11-25 L'amica_geniale_(serie_di_romanzi) 1449
... ...
2022-03-06 L'amica_geniale 30
2022-03-07 L'amica_geniale 24
2022-03-08 L'amica_geniale 15
2022-03-09 L'amica_geniale 28
2022-03-10 L'amica_geniale 18
[3499 rows x 2 columns]