Calculating a rolling maximum column based on a boolean column



I'm trying to perform a new column calculation (on OHLC stock data) without using a for loop, where the new column represents the rolling highest point based on True values in another column. I've defined a "first hour of trading" time window between 9:30 and 10:30. During that period, as we walk through the rows, if "first hour of trading" == True and the current high is >= the previous high, then take the new high. If we're not in the "first hour of trading", just carry the previous value forward.

So, using this reproducible dataset:

import numpy as np
import pandas as pd
from datetime import datetime
# Create a reproducible, static dataframe.
# 1 minute SPY data. Skip to the bottom...
df = pd.DataFrame([
{
"time": "2021-10-26 9:30",
"open": "457.2",
"high": "457.29",
"low": "456.78",
"close": "456.9383",
"volume": "594142"
},
{
"time": "2021-10-26 9:31",
"open": "456.94",
"high": "457.07",
"low": "456.8",
"close": "456.995",
"volume": "194061"
},
{
"time": "2021-10-26 9:32",
"open": "456.99",
"high": "457.22",
"low": "456.84",
"close": "457.21",
"volume": "186114"
},
{
"time": "2021-10-26 9:33",
"open": "457.22",
"high": "457.45",
"low": "457.2011",
"close": "457.308",
"volume": "294158"
},
{
"time": "2021-10-26 9:34",
"open": "457.31",
"high": "457.4",
"low": "457.25",
"close": "457.32",
"volume": "172574"
},
{
"time": "2021-10-26 9:35",
"open": "457.31",
"high": "457.48",
"low": "457.18",
"close": "457.44",
"volume": "396668"
},
{
"time": "2021-10-26 9:36",
"open": "457.48",
"high": "457.6511",
"low": "457.44",
"close": "457.57",
"volume": "186777"
},
{
"time": "2021-10-26 9:37",
"open": "457.5699",
"high": "457.73",
"low": "457.5699",
"close": "457.69",
"volume": "187596"
},
{
"time": "2021-10-26 9:38",
"open": "457.7",
"high": "457.73",
"low": "457.54",
"close": "457.63",
"volume": "185570"
},
{
"time": "2021-10-26 9:39",
"open": "457.63",
"high": "457.64",
"low": "457.31",
"close": "457.59",
"volume": "164707"
},
{
"time": "2021-10-26 9:40",
"open": "457.59",
"high": "457.72",
"low": "457.46",
"close": "457.7199",
"volume": "167438"
},
{
"time": "2021-10-26 9:41",
"open": "457.72",
"high": "457.8",
"low": "457.68",
"close": "457.72",
"volume": "199951"
},
{
"time": "2021-10-26 9:42",
"open": "457.73",
"high": "457.74",
"low": "457.6",
"close": "457.62",
"volume": "152134"
},
{
"time": "2021-10-26 9:43",
"open": "457.6",
"high": "457.65",
"low": "457.45",
"close": "457.5077",
"volume": "142530"
},
{
"time": "2021-10-26 9:44",
"open": "457.51",
"high": "457.64",
"low": "457.4001",
"close": "457.61",
"volume": "122575"
},
{
"time": "2021-10-26 9:45",
"open": "457.61",
"high": "457.76",
"low": "457.58",
"close": "457.75",
"volume": "119886"
},
{
"time": "2021-10-26 9:46",
"open": "457.74",
"high": "457.75",
"low": "457.37",
"close": "457.38",
"volume": "183157"
},
{
"time": "2021-10-26 9:47",
"open": "457.42",
"high": "457.49",
"low": "457.37",
"close": "457.44",
"volume": "128542"
},
{
"time": "2021-10-26 9:48",
"open": "457.43",
"high": "457.49",
"low": "457.33",
"close": "457.44",
"volume": "154181"
},
{
"time": "2021-10-26 9:49",
"open": "457.43",
"high": "457.5898",
"low": "457.42",
"close": "457.47",
"volume": "163063"
},
{
"time": "2021-10-26 9:50",
"open": "457.45",
"high": "457.59",
"low": "457.44",
"close": "457.555",
"volume": "96229"
},
{
"time": "2021-10-26 9:51",
"open": "457.56",
"high": "457.61",
"low": "457.31",
"close": "457.4217",
"volume": "110380"
},
{
"time": "2021-10-26 9:52",
"open": "457.42",
"high": "457.56",
"low": "457.42",
"close": "457.47",
"volume": "107518"
},
{
"time": "2021-10-26 9:53",
"open": "457.475",
"high": "457.51",
"low": "457.4",
"close": "457.48",
"volume": "78062"
},
{
"time": "2021-10-26 9:54",
"open": "457.49",
"high": "457.57",
"low": "457.42",
"close": "457.46",
"volume": "133883"
},
{
"time": "2021-10-26 9:55",
"open": "457.47",
"high": "457.56",
"low": "457.45",
"close": "457.51",
"volume": "98998"
},
{
"time": "2021-10-26 9:56",
"open": "457.51",
"high": "457.54",
"low": "457.43",
"close": "457.43",
"volume": "110237"
},
{
"time": "2021-10-26 9:57",
"open": "457.43",
"high": "457.65",
"low": "457.375",
"close": "457.65",
"volume": "98794"
},
{
"time": "2021-10-26 9:58",
"open": "457.66",
"high": "457.69",
"low": "457.35",
"close": "457.45",
"volume": "262154"
},
{
"time": "2021-10-26 9:59",
"open": "457.45",
"high": "457.47",
"low": "457.33",
"close": "457.4",
"volume": "74685"
},
{
"time": "2021-10-26 10:00",
"open": "457.41",
"high": "457.48",
"low": "457.18",
"close": "457.38",
"volume": "166617"
},
{
"time": "2021-10-26 10:01",
"open": "457.39",
"high": "457.7",
"low": "457.39",
"close": "457.5",
"volume": "265649"
},
{
"time": "2021-10-26 10:02",
"open": "457.51",
"high": "457.57",
"low": "457.39",
"close": "457.53",
"volume": "131947"
},
{
"time": "2021-10-26 10:03",
"open": "457.53",
"high": "457.54",
"low": "457.4",
"close": "457.51",
"volume": "80111"
},
{
"time": "2021-10-26 10:04",
"open": "457.51",
"high": "457.62",
"low": "457.5",
"close": "457.6101",
"volume": "117174"
},
{
"time": "2021-10-26 10:05",
"open": "457.621",
"high": "457.64",
"low": "457.51",
"close": "457.58",
"volume": "168758"
},
{
"time": "2021-10-26 10:06",
"open": "457.58",
"high": "457.64",
"low": "457.46",
"close": "457.61",
"volume": "84076"
},
{
"time": "2021-10-26 10:07",
"open": "457.62",
"high": "457.7401",
"low": "457.62",
"close": "457.66",
"volume": "125156"
},
{
"time": "2021-10-26 10:08",
"open": "457.665",
"high": "457.69",
"low": "457.5",
"close": "457.67",
"volume": "116919"
},
{
"time": "2021-10-26 10:09",
"open": "457.69",
"high": "457.72",
"low": "457.5",
"close": "457.57",
"volume": "102551"
},
{
"time": "2021-10-26 10:10",
"open": "457.56",
"high": "457.75",
"low": "457.56",
"close": "457.7",
"volume": "109165"
},
{
"time": "2021-10-26 10:11",
"open": "457.7",
"high": "457.725",
"low": "457.63",
"close": "457.66",
"volume": "146209"
},
{
"time": "2021-10-26 10:12",
"open": "457.665",
"high": "457.88",
"low": "457.64",
"close": "457.86",
"volume": "210620"
},
{
"time": "2021-10-26 10:13",
"open": "457.855",
"high": "457.96",
"low": "457.83",
"close": "457.95",
"volume": "159975"
},
{
"time": "2021-10-26 10:14",
"open": "457.95",
"high": "458.02",
"low": "457.93",
"close": "457.95",
"volume": "152042"
},
{
"time": "2021-10-26 10:15",
"open": "457.96",
"high": "458.15",
"low": "457.96",
"close": "458.08",
"volume": "146047"
},
{
"time": "2021-10-26 10:16",
"open": "458.085",
"high": "458.17",
"low": "457.99",
"close": "458.15",
"volume": "100732"
},
{
"time": "2021-10-26 10:17",
"open": "458.17",
"high": "458.33",
"low": "458.155",
"close": "458.245",
"volume": "235072"
},
{
"time": "2021-10-26 10:18",
"open": "458.25",
"high": "458.29",
"low": "458.14",
"close": "458.16",
"volume": "422002"
},
{
"time": "2021-10-26 10:19",
"open": "458.17",
"high": "458.2801",
"low": "458.1699",
"close": "458.28",
"volume": "114611"
},
{
"time": "2021-10-26 10:20",
"open": "458.29",
"high": "458.39",
"low": "458.24",
"close": "458.37",
"volume": "241797"
},
{
"time": "2021-10-26 10:21",
"open": "458.37",
"high": "458.42",
"low": "458.31",
"close": "458.345",
"volume": "124824"
},
{
"time": "2021-10-26 10:22",
"open": "458.33",
"high": "458.49",
"low": "458.33",
"close": "458.47",
"volume": "132125"
}
])

This code then goes below the df above:

# Convert df to numeric and time to datetime re: the .csv to .json
# converter tool I used online...
df[['open','high','low','close','volume']] = df[['open','high','low','close','volume']].apply(pd.to_numeric)
df['time'] = pd.to_datetime(df['time'])

# When will the first hour of trading be?
startOfFirstHourTrading = datetime.strptime("0930", "%H%M").time()
endOfFirstHourTrading = datetime.strptime("1030", "%H%M").time()

# ...then using those times, define a boolean column denoting whether
# we're currently in the first hour of trading or not
df['isFirstHourTrading'] = np.where((df['time'].dt.time >= startOfFirstHourTrading) & (df['time'].dt.time < endOfFirstHourTrading), True, False)
df['FirstHourHigh'] = 0.0

# WORKING EXAMPLE
# Iterate through the df
for i in range(len(df)):
    # If we're not currently within the first hour of trading, just
    # bring forward the last value
    if df['isFirstHourTrading'].iloc[i] == False:
        df.loc[i, 'FirstHourHigh'] = df['FirstHourHigh'].iloc[i - 1]
        continue
    # ... otherwise if we ARE in the first hour of trading, keep track
    # of the rolling highest high during the first hour
    df.loc[i, 'FirstHourHigh'] = max(df['high'].iloc[i], df['FirstHourHigh'].iloc[i - 1])

# Export the correct answer dataset to compare the next function
# to below
df.to_csv("correct_answers.csv", index=False)

# NON-WORKING EXAMPLE
# What I'd like to do is NOT use a for loop to do the above. I envision
# we can use np.where() and maybe a groupby() here? But I don't know how yet.
# Pseudocode might look something like:
df['FirstHourHigh'] = df['high'].rolling(window=DYNAMIC_START_OF_FIRST_HOUR_TRADING_ROW?).max()
# ... but that obviously doesn't work, and it doesn't take into account
# whether df['isFirstHourTrading'] == True, which it should

# Cross check for matches with correct_answers.csv
df.to_csv("correct_answers2.csv", index=False)

You can see that I've already implemented what I want using a for loop, so I have something to compare against below. I'd like to achieve the same thing as the for loop, but without a for loop, using just a one-line column calculation. Any ideas? Thanks.

First make the times actual times, then mask the high column down to the first hour of trading, take the cumulative maximum of those values, and forward-fill so the last (and therefore highest) value is carried forward:

>>> df['time'] = pd.to_datetime(df['time'])
>>> first_hour = (df['time'].dt.hour * 60 + df['time'].dt.minute).between(9.5 * 60, 10.5 * 60)
>>> df['high'].where(first_hour).cummax().ffill()

Here's what it does on a smaller example that also spans several days and includes times outside the first hour of trading:

>>> df
time  high
0 2021-10-26 09:30:00   525
1 2021-10-26 10:00:00   504
2 2021-10-26 10:30:00   550
3 2021-10-26 11:00:00   567
4 2021-10-27 09:30:00   520
5 2021-10-27 10:00:00   576
6 2021-10-27 10:30:00   508
7 2021-10-27 11:00:00   532
>>> df['high'].where((df['time'].dt.hour * 60 + df['time'].dt.minute).between(9.5 * 60, 10.5 * 60)).cummax().ffill()
0    525.0
1    525.0
2    550.0
3    550.0
4    550.0
5    576.0
6    576.0
7    576.0
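
Applied to the question's dataframe you can reuse the isFirstHourTrading column directly. And if the running high should restart on each trading day rather than carry across days (an assumption on my part; the output above carries the running maximum across days), the same mask/cummax/ffill steps can be done per date with a groupby. A minimal sketch:

# Sketch: the answer's approach combined with the question's existing
# isFirstHourTrading column; the groupby makes the running high restart
# on each trading day (an assumed requirement, not part of the answer).
df['FirstHourHigh'] = (
    df['high']
    .where(df['isFirstHourTrading'])              # keep only first-hour highs
    .groupby(df['time'].dt.date)                  # one group per trading day
    .transform(lambda s: s.cummax().ffill())      # running high, carried to day end
)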

First, you probably have a lot of data that doesn't meet the "first hour of trading" == True criterion, right? If so, we can start by filtering that data out, so we don't have to handle the other cases at all.

firstHourTrading = df[df["isFirstHourTrading"] == True]
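
From that filtered frame, the running first-hour high can then be computed with cummax and aligned back onto the full frame. A minimal sketch of that idea (not part of the original answer), using the firstHourTrading frame from the line above:

# Running high within the first hour only, reindexed back onto the full
# dataframe and forward-filled for the rows that were filtered out.
df["FirstHourHigh"] = firstHourTrading["high"].cummax().reindex(df.index).ffill()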

If that's not an option, that's fine too. We can get an immediate performance improvement over the loop by using apply together with the previous row, keeping the same algorithm you used in the for loop.

df.apply(lambda row: your_business_logic(row, row.shift()), axis=1)

Alternatively, if it fits your needs, you could try pandas' cummax function: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.cummax.html

The pandas cummax function can be used on the high column gated by isFirstHourTrading, like so:

df["FirstHourHigh"] = (df["isFirstHourTrading"]*df["high"]).cummax()
