我的数据集的一个片段看起来像:
User_id | Search_id | Price | score | clicked | company | rank
1 | 1 | 10 | 7.3 | 0 | other | 3
1 | 1 | 8 | 10.0 | 1 | other | 2
1 | 1 | 7.5 | 10.0 | 1 | us | 1
2 | 2 | 7 | 10.0 | 0 | us | 3
2 | 2 | 6.5 | 10.0 | 1 | other | 2
2 | 2 | 4 | 6.5 | 1 | other | 1
我使用的SQL查询类似于:
proc sql;
create table File1_Real as
select User_ID
, Search_ID
, mean(case when company = 'us' then rank else . end) as Our_Rank
, mean(case when company = 'us' then Price else . end) as Our_Price
, mean(case when company = 'us' then score else . end) as Our_Score
, mean(case when company = 'us' then clicked else . end) as Our_Click
, mean(case when rank <= 5 then price else . end) as Price_Top5
, mean(case when Rank = BestClickRank then price else . end) as Top_Price
, mean(case when Rank = BestClickRank then case when score= 10 then 1 else 0 end else . end) as Top_is10
, mean(BestClickRank) as Top_Rank
, min(price) as Min_Price
, count(*) as QuotesReturned
, sum(case when price < 7.5 then 1 else 0 end) as QuotesLT75
, mean(case when price < 7.5 then Score else 0 end) as LT75_Score
, sum(clicked) as TotalClicks
from (
Select *
, min(case when Clicked = 1 then Rank else . end) as BestClickRank
from work.data
where score = 10
group by user_id, search_id)
Group by 1,2
quit;
预期输出类似于:
User_id | Search_id | Our_Rank| Our_Price | Our_Score | Our_Click | Price_Top5 | Top_Price | Top_is10 | Top_Rank | Min_Price | QuotesReturned | QuotesLT75 | LT75_Score | TotalClicks
1 | 1 | 1 | 7.5 | 10.0 | 1 | 7.75 | 7.5 | 1 | 1 | 7.5 | 2 | 0 | 0 | 2
2 | 2 | 3 | 7 | 10.0 | 0 | 5.8 | 6.5 | 1 | 2 | 6.5 | 2 | 0 | 0 | 1
我试过类似的东西:
df[(df['company']!='us')].groupby(['User_id','Search_id']).agg({'price':['min','mean']})
但是:1(它不返回其余列;2(我不知道如何进行计算,返回另一列中的值,例如Our_Rank
。
有没有可能一起做这一切?我没有访问SQL pandas包的权限,所以它只能与pandas一起使用。
IIUC,您需要这样的东西:
def f(x):
d = dict()
BestClickRank = x["rank"].where(df["clicked"]==1).min()
d["Price_Top5"] = x["Price"].where(x["rank"].le(5)).mean()
d["Top_Price"] = x["Price"].where(df["rank"].eq(BestClickRank)).mean()
d["Top_is10"] = x["score"].where(x["rank"].eq(BestClickRank)).dropna().eq(10).mean()
d["Top_Rank"] = BestClickRank.mean()
d["LT75_Score"] = x["score"].where(x["Price"].lt(7.5)).mean()
return pd.Series(d)
df = data[data["score"].eq(10)]
table1 = df[df["company"].eq("us")].groupby(["User_id", "Search_id"]).agg(Our_rank=("rank","mean"),
Our_Price=("Price","mean"),
Our_Score=("score","mean"),
Our_Click=("clicked","mean"))
table2 = df.groupby(["User_id", "Search_id"]).agg(Min_Price=("Price", "min"),
Quotes_Returned=("Price","size"),
QuotesLT75=("Price", lambda x: x.lt(7.5).sum()),
TotalClicks=("clicked", "sum"))
table3 = df.groupby(["User_id", "Search_id"]).apply(f)
output = pd.concat([table1, table2, table3], axis=1)
>>> output
Our_rank Our_Price Our_Score Our_Click Min_Price Quotes_Returned QuotesLT75 TotalClicks Price_Top5 Top_Price Top_is10 Top_Rank LT75_Score
User_id Search_id
1 1 1.0 7.5 10.0 1.0 7.5 2 0 2 7.75 7.5 1.0 1.0 NaN
2 2 3.0 7.0 10.0 0.0 6.5 2 2 1 6.75 6.5 1.0 2.0 10.0
输入数据:
data = pd.DataFrame({"User_id": list(map(int, "111222")),
"Search_id": list(map(int, "111222")),
"Price": [10,8,7.5,7,6.5,4],
"score": [7.3,10,10,10,10,6.5],
"clicked": list(map(int, "011011")),
"company": ["other", "other", "us", "us", "other", "other"],
"rank": list(map(int, "321321")),
})