


import numpy as np

# Minimal toy data set: 8 samples with 2 features each, where the first four
# samples belong to group 0 and the last four to group 1.
# NOTE: X, y and groups are assigned again by the larger random example below.
X = np.arange(8 * 2).reshape(-1, 2)
y = np.arange(len(X))
groups = np.repeat([0, 1], 4)




# create data
import numpy as np

# user settings
sample_size = 100
group_probablities = [0.6, 0.4]
n_splits = 4

# Fixed seed so the random group assignment is reproducible.
rng = np.random.RandomState(42)

# Two feature columns per sample; the targets are simply the sample indices.
X = np.arange(2 * sample_size).reshape(-1, 2)
y = np.arange(len(X))

# Draw a label (1 or 2) per sample with the configured probabilities, then
# sort the labels so that each group occupies one contiguous run of indices.
groups = np.sort(rng.choice([1, 2], size=sample_size, p=group_probablities))
def grouped_kfold_subsetted(groups, n_splits):
    """Build train/test index pairs that alternate between two groups.

    In every split one group supplies the training indices and the other
    group supplies the test indices; the roles swap from one split to the
    next. The training indices are a window of ``fold_size`` consecutive
    members of the training group, where ``fold_size`` is the size of the
    smallest group divided (integer division) by ``n_splits``. The window
    moves forward after every second split, i.e. once both groups have
    served as the training group. The test indices are always all members
    of the other group.

    Parameters
    ----------
    groups : array-like
        One group label per sample. Exactly two distinct labels must be
        present; they do not have to be 0 and 1.
    n_splits : int
        Number of (train, test) pairs to generate.

    Returns
    -------
    list of tuple of numpy.ndarray
        ``n_splits`` tuples ``(train_idxs, test_idxs)`` of integer positions
        into ``groups``.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1, if ``groups`` does not contain
        exactly two distinct labels, if ``n_splits`` exceeds the size of the
        smallest group, or if the resulting training folds would hold fewer
        than two samples.
    """
    if n_splits < 1:
        raise ValueError('n_splits must be at least 1')

    groups = np.asarray(groups)

    # get unique group labels (because they don't always have to be 0 and 1)
    # and count how often a certain group is present
    unique_groups, group_counts = np.unique(groups, return_counts=True)

    # The alternating train/test scheme below is only defined for two groups.
    if len(unique_groups) != 2:
        raise ValueError('groups must contain exactly two distinct group labels')

    # The size of the smallest group determines how big a fold can
    # maximally get.
    smallest_group_size = np.min(group_counts)
    if n_splits > smallest_group_size:
        raise ValueError('Number of folds must not be greater than the number of samples in the smallest group')

    fold_size = smallest_group_size // n_splits
    if fold_size < 2:
        raise ValueError('Training folds must contain at least two samples. Choose a smaller n_splits to increase fold size')

    # Positions of each group's members; this does not change between splits.
    group_idxs = [np.where(groups == label)[0] for label in unique_groups]

    train_and_test_idxs = []
    group_switch = 0  # position (0 or 1) of the current training group
    fold_start = 0

    for split in range(n_splits):
        # subset the training group's indices to the current fold window
        fold_end = fold_start + fold_size
        train_idxs = group_idxs[group_switch][fold_start:fold_end]

        # the other group is used for testing in its entirety
        test_idxs = group_idxs[1 - group_switch]

        train_and_test_idxs.append((train_idxs, test_idxs))

        # in the next cycle the other group will form the train fold
        group_switch = 1 - group_switch

        # Advance the window only every 2nd cycle, i.e. after each of the
        # two groups has provided a train fold for the current window.
        if split % 2 == 1:
            fold_start = fold_end

    return train_and_test_idxs
# Run the splitter on the generated group labels with the configured fold count.
train_and_test_idxs = grouped_kfold_subsetted(groups=groups, n_splits=n_splits)


# NOTE: stray web-page text ("no related articles found") -- not part of the script.
