使用 order 函数排序的一般写法是:
myData.sorted = myData[ order(-myData[,date.idx],-myData[,(1+date.idx)]), ];
我想对可变数量的数值列(n.cols 个)进行排序:按照它们被传入函数的先后顺序依次作为排序键,并且每一列都可以有自己的排序方向。
#' Sort a data frame by any number of numeric columns.
#'
#' @param ddf       Data frame to sort.
#' @param mycols    Character vector of column names; the first name is the
#'                  primary sort key, the second breaks ties, and so on.
#' @param direction Either one value applied to every column, or one value per
#'                  column. "ASC" sorts ascending; any other value is treated
#'                  as descending.
#' @return `ddf` with its rows reordered (always a data frame).
sortDataFrameByNumericColumns <- function(ddf, mycols, direction = "DESC") {
  n.cols <- length(mycols)
  n.dirs <- length(direction)

  # Nothing to sort by: hand the data back untouched.
  if (n.cols == 0L) {
    return(ddf)
  }

  absent <- setdiff(mycols, names(ddf))
  if (length(absent) > 0L) {
    stop("unknown sort column(s): ", paste(absent, collapse = ", "),
         call. = FALSE)
  }
  if (n.dirs != 1L && n.dirs < n.cols) {
    stop("`direction` must have length 1 or one entry per sort column",
         call. = FALSE)
  }

  # order() takes its sort keys as separate arguments (`...`). Passing a
  # matrix does not work because a matrix is a single vector with a dim
  # attribute, so order(vecs) would rank all cells together. Collect one
  # vector per key in a list instead and splice it in with do.call().
  keys <- vector("list", n.cols)
  for (i in seq_len(n.cols)) {
    dir <- if (n.dirs == 1L) direction else direction[i]
    values <- ddf[[mycols[i]]]
    # Negating a numeric key reverses its sort order.
    ascending <- !is.na(dir) && dir == "ASC"
    keys[[i]] <- if (ascending) values else -values
  }

  # drop = FALSE keeps a one-column data frame from collapsing to a vector.
  ddf[do.call(order, keys), , drop = FALSE]
}
# basic usage
mycols = c("year","week","day");
fdf = sortDataFrameByNumericColumns (ddf,mycols,"ASC"); # sort all cols ASC
md5_email year week day V01
7 1768a550126bbf820dd89edecb92895c 2008 29 207 2.6
5 15712907fc659a6714e06659256aa0a2 2009 35 244 2.6
6 3ec0f0a866eeb8e0b419cccd6ea807b5 2010 9 60 4.2
8 8f2a765187594755f64c8d11bf34a3cc 2010 10 67 3.4
10 3b87bffacdd35679a992eadf816120a2 2010 31 216 3.4
2 db539502caf70a3074ac646d21198f5a 2011 16 111 3.4
4 4ee5096244e139d1d87eeaa0bef29d71 2011 21 143 1.0
9 3605e776744be0d11583305b0ede6419 2013 40 280 4.2
1 06da8174757feffd764c7232f965cd7a 2015 4 28 3.4
3 c29e24b16f1c8c6e897b42b45dee9297 2019 2 17 5.0
# basic usage
fdf = sortDataFrameByNumericColumns (ddf,mycols,"DESC"); # sort all cols DESC
md5_email year week day V01
3 c29e24b16f1c8c6e897b42b45dee9297 2019 2 17 5.0
1 06da8174757feffd764c7232f965cd7a 2015 4 28 3.4
9 3605e776744be0d11583305b0ede6419 2013 40 280 4.2
4 4ee5096244e139d1d87eeaa0bef29d71 2011 21 143 1.0
2 db539502caf70a3074ac646d21198f5a 2011 16 111 3.4
10 3b87bffacdd35679a992eadf816120a2 2010 31 216 3.4
8 8f2a765187594755f64c8d11bf34a3cc 2010 10 67 3.4
6 3ec0f0a866eeb8e0b419cccd6ea807b5 2010 9 60 4.2
5 15712907fc659a6714e06659256aa0a2 2009 35 244 2.6
7 1768a550126bbf820dd89edecb92895c 2008 29 207 2.6
# basic usage
mydirs = c("ASC","DESC","ASC");
fdf = sortDataFrameByNumericColumns (ddf,mycols,mydirs); # custom direction on each column ...
md5_email year week day V01
7 1768a550126bbf820dd89edecb92895c 2008 29 207 2.6
5 15712907fc659a6714e06659256aa0a2 2009 35 244 2.6
10 3b87bffacdd35679a992eadf816120a2 2010 31 216 3.4
8 8f2a765187594755f64c8d11bf34a3cc 2010 10 67 3.4
6 3ec0f0a866eeb8e0b419cccd6ea807b5 2010 9 60 4.2
4 4ee5096244e139d1d87eeaa0bef29d71 2011 21 143 1.0
2 db539502caf70a3074ac646d21198f5a 2011 16 111 3.4
9 3605e776744be0d11583305b0ede6419 2013 40 280 4.2
1 06da8174757feffd764c7232f965cd7a 2015 4 28 3.4
3 c29e24b16f1c8c6e897b42b45dee9297 2019 2 17 5.0
我使用 order 函数作为排序引擎。根据我在其他帖子上的理解,这是执行该操作最快的方式。手册指出,我传入的值(当前为矩阵 vecs)需要是一个向量序列。这是什么意思?
?order
...
a sequence of numeric, complex, character or logical vectors, all of the same length, or a classed R object.
它需要一组长度相等的向量……而我手上的是一个矩阵 vecs……如何把它转换成向量序列?这是首要问题。
所以下面的写法是有效的……但它不支持可变数量的参数(不是 variadic 的)。
fdf = sdf[order(vecs[,1],vecs[,2],vecs[,3]), ];
如果我能以某种方式将 vecs 展开为 vecs[,1],vecs[,2],vecs[,3],那就是解决方案。我知道 do.call 可能是另一种方法,但我特别想弄清楚 base::order 函数的 ... 参数写法。
以下是数据帧的示例测试用例:
x = sdf[sample(1:838,10),1:5];
x
md5_email year week day V01
733 06da8174757feffd764c7232f965cd7a 2015 4 28 3.4
546 db539502caf70a3074ac646d21198f5a 2011 16 111 3.4
811 c29e24b16f1c8c6e897b42b45dee9297 2019 2 17 5.0
585 4ee5096244e139d1d87eeaa0bef29d71 2011 21 143 1.0
249 15712907fc659a6714e06659256aa0a2 2009 35 244 2.6
344 3ec0f0a866eeb8e0b419cccd6ea807b5 2010 9 60 4.2
96 1768a550126bbf820dd89edecb92895c 2008 29 207 2.6
346 8f2a765187594755f64c8d11bf34a3cc 2010 10 67 3.4
717 3605e776744be0d11583305b0ede6419 2013 40 280 4.2
410 3b87bffacdd35679a992eadf816120a2 2010 31 216 3.4
以文本格式(先运行下面的命令,再用 Ctrl+C 复制这段文本,然后再次运行下面的命令):
"md5_email"|"year"|"week"|"day"|"V01"
"06da8174757feffd764c7232f965cd7a"|2015|4|28|3.4
"db539502caf70a3074ac646d21198f5a"|2011|16|111|3.4
"c29e24b16f1c8c6e897b42b45dee9297"|2019|2|17|5
"4ee5096244e139d1d87eeaa0bef29d71"|2011|21|143|1
"15712907fc659a6714e06659256aa0a2"|2009|35|244|2.6
"3ec0f0a866eeb8e0b419cccd6ea807b5"|2010|9|60|4.2
"1768a550126bbf820dd89edecb92895c"|2008|29|207|2.6
"8f2a765187594755f64c8d11bf34a3cc"|2010|10|67|3.4
"3605e776744be0d11583305b0ede6419"|2013|40|280|4.2
"3b87bffacdd35679a992eadf816120a2"|2010|31|216|3.4
这样就可以从剪贴板中读取数据:
x = read.table(file = "clipboard", sep = "|", header=TRUE);
我认为 do.call 可以实现您想要的。
先取出要排序的列组成一个数据框,然后对它使用 order 和 do.call。根据传入的 direction 值创建一个由 1(升序)和 -1(降序)组成的向量,并用它分别乘以各列。最后用 order 返回的顺序来选取行,这样各行就会按这些列的先后顺序排序。
#' Sort a data frame by any number of numeric columns.
#'
#' @param ddf       Data frame to sort.
#' @param mycols    Character vector of column names, in sort-priority order.
#' @param direction "ASC" or "DESC": either a single value applied to every
#'                  column, or one value per entry of `mycols`.
#' @return `ddf` with its rows reordered (always a data frame).
sortDataFrameByNumericColumns <- function(ddf, mycols, direction = "DESC") {
  n_cols <- length(mycols)
  if (n_cols == 0L) {
    return(ddf)
  }

  absent <- setdiff(mycols, names(ddf))
  if (length(absent) > 0L) {
    stop("unknown sort column(s): ", paste(absent, collapse = ", "),
         call. = FALSE)
  }
  if (!(length(direction) %in% c(1L, n_cols))) {
    stop("`direction` must have length 1 or length(mycols)", call. = FALSE)
  }
  if (!all(direction %in% c("ASC", "DESC"))) {
    # Without this check an unrecognised value would silently leave a zero
    # multiplier and turn that column into a no-op sort key.
    stop("`direction` values must be \"ASC\" or \"DESC\"", call. = FALSE)
  }

  # +1 keeps a column ascending, -1 reverses it by negating the values.
  signs <- rep_len(ifelse(direction == "ASC", 1, -1), n_cols)
  keys <- Map(`*`, unname(as.list(ddf[mycols])), signs)

  # The keys are passed unnamed so that a column called e.g. "decreasing" or
  # "method" is treated as a sort key rather than as an option of order().
  ddf[do.call(order, unname(keys)), , drop = FALSE]
}
在不同的输入上测试功能。
mycols = c("year","week","day")
fdf = sortDataFrameByNumericColumns (df,mycols,"ASC")
fdf
# md5_email year week day V01
#96 1768a550126bbf820dd89edecb92895c 2008 29 207 2.6
#249 15712907fc659a6714e06659256aa0a2 2009 35 244 2.6
#344 3ec0f0a866eeb8e0b419cccd6ea807b5 2010 9 60 4.2
#346 8f2a765187594755f64c8d11bf34a3cc 2010 10 67 3.4
#410 3b87bffacdd35679a992eadf816120a2 2010 31 216 3.4
#546 db539502caf70a3074ac646d21198f5a 2011 16 111 3.4
#585 4ee5096244e139d1d87eeaa0bef29d71 2011 21 143 1.0
#717 3605e776744be0d11583305b0ede6419 2013 40 280 4.2
#733 06da8174757feffd764c7232f965cd7a 2015 4 28 3.4
#811 c29e24b16f1c8c6e897b42b45dee9297 2019 2 17 5.0
fdf = sortDataFrameByNumericColumns (df,mycols,"DESC")
fdf
# md5_email year week day V01
#811 c29e24b16f1c8c6e897b42b45dee9297 2019 2 17 5.0
#733 06da8174757feffd764c7232f965cd7a 2015 4 28 3.4
#717 3605e776744be0d11583305b0ede6419 2013 40 280 4.2
#585 4ee5096244e139d1d87eeaa0bef29d71 2011 21 143 1.0
#546 db539502caf70a3074ac646d21198f5a 2011 16 111 3.4
#410 3b87bffacdd35679a992eadf816120a2 2010 31 216 3.4
#346 8f2a765187594755f64c8d11bf34a3cc 2010 10 67 3.4
#344 3ec0f0a866eeb8e0b419cccd6ea807b5 2010 9 60 4.2
#249 15712907fc659a6714e06659256aa0a2 2009 35 244 2.6
#96 1768a550126bbf820dd89edecb92895c 2008 29 207 2.6
mydirs = c("ASC","DESC","ASC")
fdf = sortDataFrameByNumericColumns (df,mycols,mydirs)
fdf
# md5_email year week day V01
#96 1768a550126bbf820dd89edecb92895c 2008 29 207 2.6
#249 15712907fc659a6714e06659256aa0a2 2009 35 244 2.6
#410 3b87bffacdd35679a992eadf816120a2 2010 31 216 3.4
#346 8f2a765187594755f64c8d11bf34a3cc 2010 10 67 3.4
#344 3ec0f0a866eeb8e0b419cccd6ea807b5 2010 9 60 4.2
#585 4ee5096244e139d1d87eeaa0bef29d71 2011 21 143 1.0
#546 db539502caf70a3074ac646d21198f5a 2011 16 111 3.4
#717 3605e776744be0d11583305b0ede6419 2013 40 280 4.2
#733 06da8174757feffd764c7232f965cd7a 2015 4 28 3.4
#811 c29e24b16f1c8c6e897b42b45dee9297 2019 2 17 5.0
数据
# Sample data used by the test calls: a 10-row data frame with a character
# column md5_email, integer columns year, week and day, and a numeric column
# V01. The row names are character labels ("733", "546", ...).
df <- structure(list(md5_email = c("06da8174757feffd764c7232f965cd7a",
"db539502caf70a3074ac646d21198f5a", "c29e24b16f1c8c6e897b42b45dee9297",
"4ee5096244e139d1d87eeaa0bef29d71", "15712907fc659a6714e06659256aa0a2",
"3ec0f0a866eeb8e0b419cccd6ea807b5", "1768a550126bbf820dd89edecb92895c",
"8f2a765187594755f64c8d11bf34a3cc", "3605e776744be0d11583305b0ede6419",
"3b87bffacdd35679a992eadf816120a2"), year = c(2015L, 2011L, 2019L,
2011L, 2009L, 2010L, 2008L, 2010L, 2013L, 2010L), week = c(4L,
16L, 2L, 21L, 35L, 9L, 29L, 10L, 40L, 31L), day = c(28L, 111L,
17L, 143L, 244L, 60L, 207L, 67L, 280L, 216L), V01 = c(3.4, 3.4,
5, 1, 2.6, 4.2, 2.6, 3.4, 4.2, 3.4)), class = "data.frame", row.names = c("733",
"546", "811", "585", "249", "344", "96", "346", "717", "410"))