使用R:如何使用变进制形式的顺序函数对多个数值列(ncols)进行排序



使用顺序函数排序的一般表示法是:

myData.sorted = myData[ order(-myData[,date.idx],-myData[,(1+date.idx)]), ];

我想对可变数量的数字cols(ncols(进行排序,按照它们被传递到函数中的顺序每个都有自己的潜在方向

sortDataFrameByNumericColumns = function (ddf, mycols, direction="DESC")
{
n.cols = length(mycols);
n.dirs = length(direction);
sdf = ddf;

vecs = matrix(NA, nrow=dim(sdf)[1],ncol=n.cols);

for(i in 1:n.cols)
{
idx = which( names(sdf)== mycols[i] );
dir = if(n.dirs==1) { direction } else { direction[i]};

if(dir == "ASC")
{
vecs[,i] = sdf[,idx];
} else {  
# DESC
vecs[,i] = -sdf[,idx];
}       
}   

#########################################
## how I want it, doesn't work
#fdf = sdf[order(vecs), ];
#########################################
## non-variadic approach, does work
fdf = sdf[order( vecs[,1],vecs[,2],vecs[,3] ), ];


fdf;
}
# basic usage
mycols = c("year","week","day");
fdf = sortDataFrameByNumericColumns (ddf,mycols,"ASC");  # sort all cols ASC
md5_email year week day V01
7  1768a550126bbf820dd89edecb92895c 2008   29 207 2.6
5  15712907fc659a6714e06659256aa0a2 2009   35 244 2.6
6  3ec0f0a866eeb8e0b419cccd6ea807b5 2010    9  60 4.2
8  8f2a765187594755f64c8d11bf34a3cc 2010   10  67 3.4
10 3b87bffacdd35679a992eadf816120a2 2010   31 216 3.4
2  db539502caf70a3074ac646d21198f5a 2011   16 111 3.4
4  4ee5096244e139d1d87eeaa0bef29d71 2011   21 143 1.0
9  3605e776744be0d11583305b0ede6419 2013   40 280 4.2
1  06da8174757feffd764c7232f965cd7a 2015    4  28 3.4
3  c29e24b16f1c8c6e897b42b45dee9297 2019    2  17 5.0

# basic usage
fdf = sortDataFrameByNumericColumns (ddf,mycols,"DESC");  # sort all cols DESC

md5_email year week day V01
3  c29e24b16f1c8c6e897b42b45dee9297 2019    2  17 5.0
1  06da8174757feffd764c7232f965cd7a 2015    4  28 3.4
9  3605e776744be0d11583305b0ede6419 2013   40 280 4.2
4  4ee5096244e139d1d87eeaa0bef29d71 2011   21 143 1.0
2  db539502caf70a3074ac646d21198f5a 2011   16 111 3.4
10 3b87bffacdd35679a992eadf816120a2 2010   31 216 3.4
8  8f2a765187594755f64c8d11bf34a3cc 2010   10  67 3.4
6  3ec0f0a866eeb8e0b419cccd6ea807b5 2010    9  60 4.2
5  15712907fc659a6714e06659256aa0a2 2009   35 244 2.6
7  1768a550126bbf820dd89edecb92895c 2008   29 207 2.6
# basic usage
mydirs = c("ASC","DESC","ASC");
fdf = sortDataFrameByNumericColumns (ddf,mycols,mydirs);  # custom direction on each column ...
md5_email year week day V01
7  1768a550126bbf820dd89edecb92895c 2008   29 207 2.6
5  15712907fc659a6714e06659256aa0a2 2009   35 244 2.6
10 3b87bffacdd35679a992eadf816120a2 2010   31 216 3.4
8  8f2a765187594755f64c8d11bf34a3cc 2010   10  67 3.4
6  3ec0f0a866eeb8e0b419cccd6ea807b5 2010    9  60 4.2
4  4ee5096244e139d1d87eeaa0bef29d71 2011   21 143 1.0
2  db539502caf70a3074ac646d21198f5a 2011   16 111 3.4
9  3605e776744be0d11583305b0ede6419 2013   40 280 4.2
1  06da8174757feffd764c7232f965cd7a 2015    4  28 3.4
3  c29e24b16f1c8c6e897b42b45dee9297 2019    2  17 5.0

我使用order函数作为引擎。根据我在其他帖子上的理解,这是执行操作的最快方式。手册指出,我传递的值(当前为矩阵vecs(需要是一个向量序列。这是什么意思?

?order
... 
a sequence of numeric, complex, character or logical vectors, all of the same length, or a classed R object.

它需要一个长度相等的矢量序列。。。我有一个矩阵vecs。。。如何将它们投射到向量序列中?这是首要问题。

所以这是有效的。。。但不是变异的。

fdf = sdf[order(vecs[,1],vecs[,2],vecs[,3]), ];

如果我能以某种方式将vecs变为vecs[,1],vecs[,2],vecs[,3],那就是解决方案。我知道do.call可能是另一种方法,但我特别试图理解base::order函数的...表示法。

以下是数据帧的示例测试用例:

x = sdf[sample(1:838,10),1:5];
x
md5_email year week day V01
733 06da8174757feffd764c7232f965cd7a 2015    4  28 3.4
546 db539502caf70a3074ac646d21198f5a 2011   16 111 3.4
811 c29e24b16f1c8c6e897b42b45dee9297 2019    2  17 5.0
585 4ee5096244e139d1d87eeaa0bef29d71 2011   21 143 1.0
249 15712907fc659a6714e06659256aa0a2 2009   35 244 2.6
344 3ec0f0a866eeb8e0b419cccd6ea807b5 2010    9  60 4.2
96  1768a550126bbf820dd89edecb92895c 2008   29 207 2.6
346 8f2a765187594755f64c8d11bf34a3cc 2010   10  67 3.4
717 3605e776744be0d11583305b0ede6419 2013   40 280 4.2
410 3b87bffacdd35679a992eadf816120a2 2010   31 216 3.4

以文本格式(运行以下命令,然后Cntrl+C此文本,然后再次运行以下命令(:

"md5_email"|"year"|"week"|"day"|"V01"
"06da8174757feffd764c7232f965cd7a"|2015|4|28|3.4
"db539502caf70a3074ac646d21198f5a"|2011|16|111|3.4
"c29e24b16f1c8c6e897b42b45dee9297"|2019|2|17|5
"4ee5096244e139d1d87eeaa0bef29d71"|2011|21|143|1
"15712907fc659a6714e06659256aa0a2"|2009|35|244|2.6
"3ec0f0a866eeb8e0b419cccd6ea807b5"|2010|9|60|4.2
"1768a550126bbf820dd89edecb92895c"|2008|29|207|2.6
"8f2a765187594755f64c8d11bf34a3cc"|2010|10|67|3.4
"3605e776744be0d11583305b0ede6419"|2013|40|280|4.2
"3b87bffacdd35679a992eadf816120a2"|2010|31|216|3.4

在那里你可以从剪贴板中读取。。。

x = read.table(file = "clipboard", sep = "|", header=TRUE);

我认为do.call可以实现您想要的。

为要排序的列设置数据帧,然后应用orderdo.call。创建一个向量,根据传递的direction值乘以每列。使用返回的顺序来选择行,这些行将根据列的顺序对行进行排序。

sortDataFrameByNumericColumns <- function(ddf, mycols, direction="DESC") {
newvec <- integer(length(mycols))
newvec[direction == 'ASC'] <- 1
newvec[direction == 'DESC'] <- -1
ddf[do.call(order, sweep(ddf[cols], 2, newvec, `*`)), ]
}

在不同的输入上测试功能。

mycols = c("year","week","day")
fdf = sortDataFrameByNumericColumns (df,mycols,"ASC")
fdf
#                           md5_email year week day V01
#96  1768a550126bbf820dd89edecb92895c 2008   29 207 2.6
#249 15712907fc659a6714e06659256aa0a2 2009   35 244 2.6
#344 3ec0f0a866eeb8e0b419cccd6ea807b5 2010    9  60 4.2
#346 8f2a765187594755f64c8d11bf34a3cc 2010   10  67 3.4
#410 3b87bffacdd35679a992eadf816120a2 2010   31 216 3.4
#546 db539502caf70a3074ac646d21198f5a 2011   16 111 3.4
#585 4ee5096244e139d1d87eeaa0bef29d71 2011   21 143 1.0
#717 3605e776744be0d11583305b0ede6419 2013   40 280 4.2
#733 06da8174757feffd764c7232f965cd7a 2015    4  28 3.4
#811 c29e24b16f1c8c6e897b42b45dee9297 2019    2  17 5.0
fdf = sortDataFrameByNumericColumns (df,mycols,"DESC")
fdf
#                           md5_email year week day V01
#811 c29e24b16f1c8c6e897b42b45dee9297 2019    2  17 5.0
#733 06da8174757feffd764c7232f965cd7a 2015    4  28 3.4
#717 3605e776744be0d11583305b0ede6419 2013   40 280 4.2
#585 4ee5096244e139d1d87eeaa0bef29d71 2011   21 143 1.0
#546 db539502caf70a3074ac646d21198f5a 2011   16 111 3.4
#410 3b87bffacdd35679a992eadf816120a2 2010   31 216 3.4
#346 8f2a765187594755f64c8d11bf34a3cc 2010   10  67 3.4
#344 3ec0f0a866eeb8e0b419cccd6ea807b5 2010    9  60 4.2
#249 15712907fc659a6714e06659256aa0a2 2009   35 244 2.6
#96  1768a550126bbf820dd89edecb92895c 2008   29 207 2.6
mydirs = c("ASC","DESC","ASC")
fdf = sortDataFrameByNumericColumns (df,mycols,mydirs)
fdf
#                           md5_email year week day V01
#96  1768a550126bbf820dd89edecb92895c 2008   29 207 2.6
#249 15712907fc659a6714e06659256aa0a2 2009   35 244 2.6
#410 3b87bffacdd35679a992eadf816120a2 2010   31 216 3.4
#346 8f2a765187594755f64c8d11bf34a3cc 2010   10  67 3.4
#344 3ec0f0a866eeb8e0b419cccd6ea807b5 2010    9  60 4.2
#585 4ee5096244e139d1d87eeaa0bef29d71 2011   21 143 1.0
#546 db539502caf70a3074ac646d21198f5a 2011   16 111 3.4
#717 3605e776744be0d11583305b0ede6419 2013   40 280 4.2
#733 06da8174757feffd764c7232f965cd7a 2015    4  28 3.4
#811 c29e24b16f1c8c6e897b42b45dee9297 2019    2  17 5.0

数据

df <- structure(list(md5_email = c("06da8174757feffd764c7232f965cd7a", 
"db539502caf70a3074ac646d21198f5a", "c29e24b16f1c8c6e897b42b45dee9297", 
"4ee5096244e139d1d87eeaa0bef29d71", "15712907fc659a6714e06659256aa0a2", 
"3ec0f0a866eeb8e0b419cccd6ea807b5", "1768a550126bbf820dd89edecb92895c", 
"8f2a765187594755f64c8d11bf34a3cc", "3605e776744be0d11583305b0ede6419", 
"3b87bffacdd35679a992eadf816120a2"), year = c(2015L, 2011L, 2019L, 
2011L, 2009L, 2010L, 2008L, 2010L, 2013L, 2010L), week = c(4L, 
16L, 2L, 21L, 35L, 9L, 29L, 10L, 40L, 31L), day = c(28L, 111L, 
17L, 143L, 244L, 60L, 207L, 67L, 280L, 216L), V01 = c(3.4, 3.4, 
5, 1, 2.6, 4.2, 2.6, 3.4, 4.2, 3.4)), class = "data.frame", row.names = c("733", 
"546", "811", "585", "249", "344", "96", "346", "717", "410"))

最新更新