


# Keep an ordered view of df in sorted_df; presumably this sorts ascending by date (orderBy/asc are defined outside this snippet — TODO confirm).
# NOTE(review): `date` is passed as a bare name, not the string "date" — confirm it is defined earlier.
sorted_df = df.orderBy(asc(date))

我现在需要一个函数,用来找出数据框中前30%的数据行,并在新列中为这些行创建一个标志;在本例中,也就是前2100行(7000 * 0.3)。然后,我想改进这个函数,为落入40%、50%、60%等交易区间的行添加额外的标志。问题的下一部分是能够将其应用于数据中的多个不同月份(对于上面的df,我已将其过滤为只含一个月的数据,以便更容易处理)。我卡在了这里,因为我刚开始学习编写函数,希望借此机会学习。多谢!


def flag_dataframe(df):
    """Label every row with the percentage bracket its position falls into.

    Rows are taken in their current order, so sort the frame beforehand if
    the order matters. A row at 0-based position ``p`` out of ``n`` rows
    receives the first bracket that is strictly greater than
    ``p / n * 100``. With 7000 rows, positions 0-2099 get ``"30%"``,
    positions 2100-2799 get ``"40%"``, and so on up to ``"100%"``.

    Args:
        df: pandas DataFrame to label. It is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and a trailing ``"Flag"``
        column holding each row's bracket label (e.g. ``"30%"``).

    Raises:
        ValueError: if ``df`` already has a ``"Flag"`` column (raised by
            ``DataFrame.insert``).
    """
    flags = [30, 40, 50, 60, 70, 80, 90, 100]  # bracket upper bounds, ascending

    # drop=True discards the old index instead of leaking it into the result
    # as an extra "index" column.
    df = df.reset_index(drop=True)
    total = len(df)

    # Strict "<" keeps the boundary row out of the lower bracket: position
    # 2100 of 7000 is exactly 30.0 and therefore belongs to "40%".
    # position / total * 100 is always below 100, so next() always finds a flag.
    labels = [
        next(f"{flag}%" for flag in flags if position / total * 100 < flag)
        for position in range(total)
    ]

    # Write the whole column once, under the same name the docstring promises.
    df.insert(len(df.columns), "Flag", labels)
    return df


# Rebind df to the DataFrame returned by flag_dataframe.
df = flag_dataframe(df)



def flag_dataframe_by_month(df):
    """Label rows with their percentage bracket, computed separately per month.

    Within each month, rows are taken in their current order. A row at
    0-based position ``p`` among the ``n`` rows of its month receives the
    first bracket that is strictly greater than ``p / n * 100``
    (``"30%"``, ``"40%"``, ... ``"100%"``).

    Args:
        df: pandas DataFrame with a ``"Month"`` column holding full English
            month names (``"January"`` ... ``"December"``). It is not
            modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and a trailing ``"Flag"``
        column. Rows whose ``"Month"`` is not one of the twelve month names
        keep ``None`` in ``"Flag"``.

    Raises:
        KeyError: if ``df`` has no ``"Month"`` column.
        ValueError: if ``df`` already has a ``"Flag"`` column (raised by
            ``DataFrame.insert``).
    """
    flags = [30, 40, 50, 60, 70, 80, 90, 100]  # bracket upper bounds, ascending
    months = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ]

    # drop=True discards the old index outright, so there is no helper column
    # to remove afterwards; this also works when the input index is named or
    # the frame already has a column called "index".
    df = df.reset_index(drop=True)
    df.insert(len(df.columns), "Flag", None)

    for month in months:
        in_month = df["Month"] == month
        month_total = int(in_month.sum())
        if month_total == 0:
            continue  # no rows for this month, nothing to label

        # One label per row of the month, in row order. position / month_total
        # * 100 is always below 100, so next() always finds a flag.
        labels = [
            next(f"{flag}%" for flag in flags if position / month_total * 100 < flag)
            for position in range(month_total)
        ]
        # A single masked assignment per month instead of one write per row.
        df.loc[in_month, "Flag"] = labels

    return df


