
我有以下 numpy 数组:

import numpy as np
# Sample data: the last column is the group key (two rows per key).
arr = np.array([[1, 2, 3, 4, 2000],
                [5, 6, 7, 8, 2000],
                [9, 0, 1, 2, 2001],
                [3, 4, 5, 6, 2001],
                [7, 8, 9, 0, 2002],
                [1, 2, 3, 4, 2002],
                [5, 6, 7, 8, 2003],
                [9, 0, 1, 2, 2003]])

我了解 np.sum(arr, axis=0) 会给出以下结果:

array([   40,    28,    36,    34, 16012])


array([[   6,    8,   10,   12, 4000],
[  12,    4,    6,    8, 4002],
[   8,   10,   12,    4, 4004],
[  14,    6,    8,   10, 4006]])


如何才能得到上面第二个结果(按最后一列的唯一值分组后,对各组的行求和)?如果必须使用 for 循环,那么它将如何工作?

我尝试了 np.sum(arr[:, 4]==2000, axis=0)(我打算用 for 循环中的变量替换 2000),但它给出的结果是 2。

您可以巧妙地结合使用 np.diff 和 np.add.reduceat,在纯 numpy 中完成此操作。np.diff 会给出最右边一列发生变化的位置:

# Difference between consecutive entries of the last column: non-zero exactly
# where the group key changes from one row to the next.
d = np.diff(arr[:, -1])


# Turn the non-zero mask into the integer positions of those changes.
d = np.where(d)[0]


# reduceat needs the start row of every group: row 0, plus the row right
# after each change (hence the shift by one).
indices = np.r_[0, d + 1]


# Sum the rows of each [start, next_start) slice along the first axis.
result = np.add.reduceat(arr, indices, axis=0)


>>> result = np.add.reduceat(arr, np.r_[0, np.where(np.diff(arr[:, -1]))[0] + 1], axis=0)
>>> result
array([[   6,    8,   10,   12, 4000],
[  12,    4,    6,    8, 4002],
[   8,   10,   12,    4, 4004],
[  14,    6,    8,   10, 4006]])


import pandas as pd
df = pd.DataFrame(arr)
# Sum columns 0-3 within each group of column 4. reset_index() turns the group
# key back into a regular column, and selecting columns 0..4 restores the
# original column order (the key would otherwise come first).
x = df.groupby(4).sum().reset_index()[list(range(5))]
# After the groupby the key column holds each key once; multiply by the group
# size so it becomes the sum of the key column for groups of any size
# (a fixed factor of 2 is only right when every group has exactly two rows).
x[4] *= df.groupby(4).size().to_numpy()
array([[   6,    8,   10,   12, 4000],
[  12,    4,    6,    8, 4002],
[   8,   10,   12,    4, 4004],
[  14,    6,    8,   10, 4006]])


# Sum each run of consecutive rows that share the same last-column value.
# itertools.groupby only merges adjacent items, so rows with equal keys must
# already be next to each other. NOTE: `itertools` is not imported in the
# visible snippet; `import itertools` is needed for this line to run.
np.array([sum(x[1]) for x in itertools.groupby(arr, key = lambda k: k[-1])])
array([[   6,    8,   10,   12, 4000],
[  12,    4,    6,    8, 4002],
[   8,   10,   12,    4, 4004],
[  14,    6,    8,   10, 4006]])



def groupbycol(a, assume_sorted_col=False, colID=-1):
    """Sum the rows of a 2-D array grouped by the values in one column.

    Parameters
    ----------
    a : array_like
        2-D input; rows are the items to be grouped.
    assume_sorted_col : bool, optional
        If true, rows with equal keys are taken to be adjacent already and
        ``a`` is used as is. If false (default), rows are first re-ordered
        by an argsort of the key column.
    colID : int, optional
        Index of the column holding the group key (default: last column).

    Returns
    -------
    numpy.ndarray
        One row per group, in the order the groups occur in the (possibly
        sorted) rows. Every column is summed, including the key column.
    """
    a = np.asarray(a)

    if a.shape[0] == 0:
        # reduceat cannot take a start index into an empty axis.
        return a.copy()

    if assume_sorted_col:
        a_s = a
    else:
        # Rows are not known to be ordered by the key column: re-arrange
        # them so that equal keys become adjacent.
        sidx = a[:, colID].argsort()
        a_s = a[sidx]

    # Start index of every group: row 0 plus each row whose key differs
    # from the previous row's key.
    cut_idx = np.flatnonzero(np.r_[True, a_s[1:, colID] != a_s[:-1, colID]])

    # Sum the rows of each [start, next_start) interval along the first axis.
    return np.add.reduceat(a_s, cut_idx, axis=0)

示例运行 -

In [64]: arr
array([[   1,    2,    3,    4, 2000],
[   5,    6,    7,    8, 2000],
[   9,    0,    1,    2, 2001],
[   3,    4,    5,    6, 2001],
[   7,    8,    9,    0, 2002],
[   1,    2,    3,    4, 2002],
[   5,    6,    7,    8, 2003],
[   9,    0,    1,    2, 2003]])
In [65]: # Shuffle rows of input array to create a generic last col (not sorted)
...: np.random.seed(0)
...: np.random.shuffle(arr)
In [66]: arr
array([[   5,    6,    7,    8, 2003],
[   9,    0,    1,    2, 2001],
[   5,    6,    7,    8, 2000],
[   9,    0,    1,    2, 2003],
[   3,    4,    5,    6, 2001],
[   1,    2,    3,    4, 2000],
[   1,    2,    3,    4, 2002],
[   7,    8,    9,    0, 2002]])
In [67]: groupbycol(arr, assume_sorted_col=False, colID=-1)
array([[   6,    8,   10,   12, 4000],
[  12,    4,    6,    8, 4002],
[   8,   10,   12,    4, 4004],
[  14,    6,    8,   10, 4006]])


我们基本上可以用“广播生成掩码 + 矩阵乘法”来替换 np.add.reduceat,从而利用快速的 BLAS;这种方法同样适用于该列未排序的一般情况 -

import pandas as pd
def groupbycol_matmul(a, colID=-1):
    """Sum the rows of a 2-D array grouped by one column via a mask product.

    Parameters
    ----------
    a : numpy.ndarray
        2-D input; rows are the items to be grouped. The key column does not
        need to be sorted.
    colID : int, optional
        Index of the column holding the group key (default: last column).

    Returns
    -------
    numpy.ndarray
        One row per distinct key, in order of first appearance of the key.
        Every column is summed, including the key column.
    """
    # Distinct keys in order of first appearance.
    keys = pd.Series(a[:, colID]).unique()
    # Broadcast to an (n_groups, n_rows) boolean membership mask.
    mask = keys[:, None] == a[:, colID]
    # Each mask row selects and adds up the rows of one group.
    return mask.dot(a)


import numpy as np
import numpy_indexed as npi
# Sample data: the last column is the group key (two rows per key).
arr = np.array([[1, 2, 3, 4, 2000],
                [5, 6, 7, 8, 2000],
                [9, 0, 1, 2, 2001],
                [3, 4, 5, 6, 2001],
                [7, 8, 9, 0, 2002],
                [1, 2, 3, 4, 2002],
                [5, 6, 7, 8, 2003],
                [9, 0, 1, 2, 2003]])

# Group the rows of arr by the value in column 4 and sum each group.
# NOTE(review): the trailing [1] presumably picks the summed rows out of a
# (unique keys, sums) pair returned by .sum() — confirm against the
# numpy_indexed documentation.
result = npi.GroupBy(arr[:, 4]).sum(arr)[1]
>>>[[   6    8   10   12 4000]
[  12    4    6    8 4002]
[   8   10   12    4 4004]
[  14    6    8   10 4006]]
