import pandas as pd
import csv
import re
import json
import requests
# Per-column query fragment; the listing URL repeats it for columns 0-6.
_COLUMN_QUERY = (
    "&columns%5B{i}%5D%5Bdata%5D={i}"
    "&columns%5B{i}%5D%5Bname%5D="
    "&columns%5B{i}%5D%5Bsearchable%5D=true"
    "&columns%5B{i}%5D%5Borderable%5D=true"
    "&columns%5B{i}%5D%5Bsearch%5D%5Bvalue%5D="
    "&columns%5B{i}%5D%5Bsearch%5D%5Bregex%5D=false"
)

# Paginated doctor listing requested with start=20000, length=8751, year=2020.
# NOTE: the parameter after "name=" is "registrationNo"; "&reg" must not be
# collapsed into the registered-trademark character.
_DOCTORS_URL = (
    "https://www.nmc.org.in/MCIRest/open/getPaginatedData"
    "?service=getPaginatedDoctor&draw=1"
    + "".join(_COLUMN_QUERY.format(i=i) for i in range(7))
    + "&order%5B0%5D%5Bcolumn%5D=0&order%5B0%5D%5Bdir%5D=asc"
    "&start=20000&length=8751"
    "&search%5Bvalue%5D=&search%5Bregex%5D=false"
    "&name=&registrationNo=&smcId=&year=2020&_=1611587198138"
)

# Captures the two quoted arguments of the openDoctorDetailsnew('..', '..')
# call found in a row's last cell. The opening parenthesis is escaped so it
# is matched literally instead of starting a regex group.
_DETAILS_CALL = re.compile(r"openDoctorDetailsnew\('([^']*)', '([^']*)'")


def Table(url=_DOCTORS_URL,
          out_path=r'C:\Users\SmartDB\Desktop\2020_out_2.csv'):
    """Download the doctor listing, save it as CSV and return the detail keys.

    Args:
        url: Location handed to ``pd.read_json``; the decoded document must
            have a ``data`` entry holding one list per doctor.
        out_path: CSV file to (over)write with a header row followed by
            fields 1-5 of every listing row.

    Returns:
        A list of ``[doctor_id, regd_no_value]`` string pairs, one for each
        row whose field 6 contains an ``openDoctorDetailsnew('..', '..')``
        call. Rows without such a call are still written to the CSV but
        contribute no pair.
    """
    table = pd.read_json(url)['data']
    data = []
    with open(out_path, 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(
            ['Year Of The Info', 'Registration#', 'State Medical Councils',
             'Name', 'FatherName'])
        for item in table:
            writer.writerow(item[1:6])
            required = item[6]
            # Guard against missing/non-text cells instead of crashing on
            # ``None.group()`` halfway through the export.
            match = (_DETAILS_CALL.search(required)
                     if isinstance(required, str) else None)
            if match is None:
                print(f"No doctor details link found in row: {item[1:6]}")
                continue
            data.append(list(match.groups()))
    print("Data Saved Into out.csv")
    return data
def Details():
    """Fetch the detail record of every doctor returned by ``Table()``.

    One POST request is sent per ``(doctor_id, regd_no_value)`` pair. A
    request that fails, returns an HTTP error status, or returns a body that
    is not a JSON object is reported on stdout and skipped, so one bad
    response does not abort the whole run.

    Returns:
        A ``(names, items)`` tuple: ``names`` holds each distinct set of
        keys seen across the records, ``items`` holds the values of every
        successfully fetched record, in request order.
    """
    url = ("https://www.nmc.org.in/MCIRest/open/getDataFromService"
           "?service=getDoctorDetailsByIdImr")
    names = []
    items = []
    for doc, val in Table():
        print(f"Extracting DoctorID# {doc}, RegValue# {val}")
        # Named "payload" so the imported json module is not shadowed.
        payload = {'doctorId': doc, 'regdNoValue': val}
        try:
            response = requests.post(url, json=payload, timeout=30)
            response.raise_for_status()
            # An empty or HTML body raises a JSONDecodeError here; every
            # JSON decoder requests may use derives it from ValueError.
            record = response.json()
        except (requests.RequestException, ValueError) as exc:
            print(f"Skipping DoctorID# {doc}: {exc}")
            continue
        if not isinstance(record, dict):
            print(f"Skipping DoctorID# {doc}: response is not a JSON object")
            continue
        if record.keys() not in names:
            names.append(record.keys())
        items.append(record.values())
    print("Done")
    return names, items
def Save(out_path=r'C:\Users\SmartDB\Desktop\2020_data_2.csv'):
    """Write the records collected by ``Details()`` to a CSV file.

    Args:
        out_path: CSV file to (over)write. The default is a raw string so
            the backslashes of the Windows path are kept literally.

    The key rows returned by ``Details()`` are written first, followed by
    one row of values per fetched record.
    """
    # Collect everything before opening the file so a failed scrape does
    # not leave a truncated, empty CSV behind.
    n, i = Details()
    with open(out_path, 'w', newline="") as d:
        writer = csv.writer(d)
        writer.writerows(n)
        writer.writerows(i)
# Script entry point: Save() calls Details(), which in turn calls Table().
Save()
在抓取网页、提取数据的过程中,我遇到了这个错误:"simplejson.errors.JSONDecodeError: Expecting value: line 1 column 1 (char 0)"。请帮我解决这个错误。数据出现在 out.csv 文件中,而没有出现在 data.csv 文件中。
我猜你用 pd.read_json() 请求的URL是错误的,因为这是你代码中读取json的唯一地方。
simplejson.errors.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
意味着它无法解码json,出错位置是json文本的第1行第1列(而不是源代码的第1行第1列)。
很可能URL是错误的,返回的是空响应而不是json。
编辑:
如前所述,pandas.read_json()也接受一个URL。我运行了你的代码,它一直运行成功。问题可能是您从端点获得非200响应,并且响应体不是JSON。
这可能是因为您有打字错误(可能)或您的授权无效(不太可能因为我刚刚运行它并且它有效),或者您发出了太多请求并且API服务器暂时阻止了您的IP:)
也可能有许多其他原因。也许你可以在转换为JSON之前发布你的请求结果,我们可以更好地帮助你,即
r = requests.post("https://www.nmc.org.in/MCIRest/open/getDataFromService?service=getDoctorDetailsByIdImr", json=json)
j = r.json()
然后告诉我们 r 是什么。
确保你的请求的所有部分都正确输入。
您的第一行似乎试图使用pandas读取json文件,但您传入的只是一个URL字符串。
table = pd.read_json("https://www.nmc.org.in/MCIRest/open/getPaginatedData?service=getPaginatedDoctor&draw=1&columns%5B0%...")['data']
这不是JSON。你必须使用GET请求从网页下载JSON,然后尝试将其作为JSON读取。
这个错误意味着 pd.read_json() 试图把一个URL字符串当作JSON来读取,而它当然不是JSON。