我有以下数据帧:
df = dput(sent_results_mp)
structure(list(Date = structure(c(11690, 11725, 11753, 11781,
11809, 11844, 11872, 11942, 11970, 11998, 12026, 12061, 12089,
12117, 12145, 12180, 12208, 12243, 12299, 12327, 12362, 12390,
12425, 12453, 12481, 12509, 12544, 12572, 12600, 12663, 12698,
12726, 12754, 12796, 12817, 12845, 12880, 12907, 12936, 12971,
13027, 13062, 13090, 13118, 13160, 13181, 13209, 13244, 13272,
13307, 13335, 13363, 13391, 13426, 13454, 13489, 13524, 13552,
13580, 13615, 13643, 13670, 13699, 13762, 13790, 13825, 13853,
13888, 13916, 13944, 13979, 14007, 14035, 14063, 14098, 14126,
14154, 14189, 14217, 14259, 14280, 14308, 14336, 14371, 14399,
14427, 14462, 14490, 14525, 14553, 14581, 14623, 14644, 14672,
14707, 14735, 14770, 14798, 14826, 14854, 14889, 14917, 14945,
14987, 15008, 15036, 15071, 15099, 15134, 15162, 15190, 15225,
15253, 15281, 15316, 15351, 15379, 15407, 15434, 15463, 15497,
15526, 15554, 15589, 15617, 15652, 15680, 15715, 15743, 15771,
15799, 15827, 15862, 15890, 15918, 15953, 15980, 16016, 16044,
16079, 16107, 16135, 16163, 16198, 16226, 16254, 16289, 16317,
16345, 16380, 16408, 16457, 16499, 16540, 16589, 16632, 16681,
16730, 16772, 16821, 16870, 16912, 16954, 17003, 17052, 17094,
17143, 17185, 17234, 17283, 17325, 17367, 17416, 17465, 17514,
17556, 17598, 17647, 17696, 17738, 17787, 17829, 17878, 17920,
17962, 17996, 18053, 18102, 18151, 18193, 18242, 18284, 18333,
18382, 18417, 18459, 18515, 18564, 18606), class = "Date"), Sentiment = c(0,
0, 0, 0, -0.0957529552593197, -0.239862173958995, 0, 0, 0, -0.188924269202003,
-0.091167278669553, 0, -0.135208666258976, -0.234113624774356,
-0.212585339838951, -0.105651305696646, -0.222774417797656, -0.159894235955241,
-0.138472470255942, -0.0585150174036769, 0.103711250566817, 0.22031952297964,
0.145250201234944, -0.0323994910462342, 0.111593250830965, 0.0295247989691986,
0, 0.212482042926666, 0.158605081733718, 0.191920529302748, 0.173414462996843,
0.216876858126887, 0.20756261971968, 0.154883876782037, 0.149137898593547,
0.142165761354657, 0.121643683652311, 0.143972172018566, 0.0657414955655748,
-0.0324914378153873, 0.22264327839293, 0.185010294208908, 0.200374287833188,
0.0673440137703783, 0.0929770244712388, 0.113554686221999, 0.182856427591615,
0.0934514885524806, 0.183617393098071, 0.0937019320979792, 0.159373306262786,
0.189215896294599, 0.112404940944111, 0.148174048572046, 0.141732287695358,
0.110064057625983, 0.0551168195596121, 0.123436755747825, 0.133480320068247,
0.101616646400401, 0.109118646472558, 0.13840615378245, 0.063620964728031,
-0.0929040604032753, 0.0213563501297979, 0.0477819585658106,
0.0521622996105026, 0.116380748001251, 0.0145928432680972, 0.130906487222809,
0.242361449110559, 0.122366901526266, 0.189549946862169, 0.121341708778681,
0.0275950455528882, 0.115830049217305, -0.00268196934335387,
-0.268667095406521, -0.270654675922787, -0.345097118931143, -0.201666970593853,
-0.197870820068962, -0.183318344286775, -0.256780314062594, -0.164952289086926,
-0.203040493784049, -0.0953052320370853, -0.0564353925542141,
-0.0595365444481821, -0.0391431509940865, -0.158559094494205,
-0.103666510741687, -0.0341276356813399, -0.145250783683547,
-0.0348102368055625, -0.11164828986622, -0.153872166218408, -0.172037964130696,
-0.143868340198388, -0.253504128423768, -0.161731136069997, -0.0757134096682664,
-0.24850284019793, -0.0241736860282622, -0.0497985627029049,
0.0267362983301218, -0.0288807405823324, -0.0216738497959909,
-0.0547198276698082, -0.0713004669575829, -0.179452611448087,
-0.0882044859593271, -0.262589845197242, -0.277194378086572,
-0.264553715711326, -0.259524252590977, -0.196382049608858, -0.0759425223156328,
-0.0361527200578723, -0.106850259008264, -0.286292773362804,
-0.25954572063577, -0.28324826826043, -0.290781100752149, -0.221078940043079,
-0.203717181838236, -0.156778915779193, -0.0518897691822025,
-0.192229960033653, -0.170680784237861, -0.270654390695637, -0.305547637220867,
-0.077830807264447, -0.142463997328291, -0.111816767186701, -0.0969791693314262,
-0.0977361569524874, -0.273767496166023, -0.25730447171265, -0.271029007126539,
-0.252917481283751, -0.242632588726111, -0.0811418391092237,
-0.215566014069223, -0.362387647325206, -0.270342963775206, -0.314850509295431,
-0.337486756477923, -0.285711580880554, -0.340974038135234, -0.331945731128118,
-0.304930629110979, -0.150278570470029, -0.283028615895731, -0.277429801826168,
-0.185458849789886, -0.277688809057794, -0.22402331728913, -0.375964402684551,
-0.255242751090043, -0.344736427759783, -0.309107710644531, -0.293704494677478,
-0.243479266198671, -0.245051446889861, -0.231687103050292, -0.350613730820339,
-0.169486098633503, -0.311849636896508, -0.223447284729041, -0.119431392144917,
-0.275098935303954, -0.22189613629342, -0.299753093156644, -0.213188459452649,
-0.194131767679836, -0.293604768058008, -0.235407074042252, -0.304132799905395,
-0.34748336796163, -0.362535972556536, -0.321477097322425, -0.364037876416921,
-0.313450665525883, -0.294676350612345, -0.254149309015538, -0.232363154687674,
-0.313000245989788, -0.326964317594564, -0.26214055036952, -0.268116079703687,
-0.267480139301619, -0.319864093982952, -0.360481609256352, -0.314464988256011,
-0.298918676729976, -0.278082463263552, -0.270358321007133, -0.328174016516938
)), class = "data.frame", row.names = c(NA, -199L))
此数据框的观测频率不规则。使用以下代码,可以看到该序列中缺失的月份(如果这些月份都有观测值,该序列就是按月的):
# For each calendar year present in df, list the names of the months that
# have no observation in that year. The result is a list named by year.
missingMonths <- lapply(
  split(df, format(as.Date(df$Date), "%Y")),
  function(year_rows) {
    # Month numbers (1-12) that actually occur in this year's rows.
    observed <- as.numeric(format(as.Date(year_rows$Date), "%m"))
    # Months of the year that never occur, translated to English names.
    absent <- setdiff(seq_len(12), observed)
    month.name[absent]
  }
)
missingMonths
# this is from 2015 to 2020
$`2015`
[1] "February" "May" "August" "November"
$`2016`
[1] "February" "May" "August" "November"
$`2017`
[1] "February" "May" "August" "November"
$`2018`
[1] "February" "May" "August" "November"
$`2019`
[1] "February" "May" "August" "November"
$`2020`
[1] "February" "May" "August" "November"
我只打印了2015年至2020年的缺失月份,因为它们呈现出一种规律:缺失的总是相同的四个月。我想做的是,把此前各年份中这四个月的观测值也去掉。举个例子,下面是2014年的序列,其中没有缺失的月份:
Date Sentiment
140 2014-01-09 -0.27102901
141 2014-02-06 -0.25291748
142 2014-03-06 -0.24263259
143 2014-04-03 -0.08114184
144 2014-05-08 -0.21556601
145 2014-06-05 -0.36238765
146 2014-07-03 -0.27034296
147 2014-08-07 -0.31485051
148 2014-09-04 -0.33748676
149 2014-10-02 -0.28571158
150 2014-11-06 -0.34097404
151 2014-12-04 -0.33194573
# Yet, I want to remove the observations for February, May, August and November by taking the mean with the previous month. I would get:
Date Sentiment
140 2014-01-09 # this should be the mean between jan and feb
142 2014-03-06 -0.24263259
143 2014-04-03 # this should be the mean of april and may
145 2014-06-05 -0.36238765
146 2014-07-03 # this should be the mean of july and august
148 2014-09-04 -0.33748676
149 2014-10-02 # this should be the mean of october and november
151 2014-12-04 -0.33194573
这应该适用于所有年份。
有人能帮我吗?
谢谢!
如果您想手动从完整数据中删除月份,您可以执行以下操作:
library(dplyr)
# Month numbers whose rows are folded into the preceding kept row.
missingMonths <- c(2, 5, 8, 11)
df %>%
  mutate(
    month_num = as.numeric(format(Date, "%m")),
    # The counter advances on every kept row, so each kept row and the
    # to-be-removed rows that directly follow it share one group id.
    group = cumsum(!(month_num %in% missingMonths))
  ) %>%
  group_by(group) %>%
  # Keep the date of the first row of each group; average its Sentiment values.
  summarise(
    Date = first(Date),
    Sentiment = mean(Sentiment)
  ) %>%
  select(Date, Sentiment)
# Date Sentiment
# <date> <dbl>
# 1 2002-01-03 0
# 2 2002-03-07 0
# 3 2002-04-04 -0.0479
# 4 2002-06-06 -0.240
# 5 2002-07-04 0
# 6 2002-09-12 0
# 7 2002-10-10 -0.0945
# 8 2002-12-05 -0.0912
# 9 2003-01-09 -0.0676
#10 2003-03-06 -0.234
# … with 141 more rows
您可以利用 `months` 函数提取月份名称;类似地,可以使用 `substr` 提取年份。这样就能用 `table` 统计每个月份在各年份中的观测次数,并把每个月份"观测次数为零的年份数"记为 `z`,用于后续取子集。
# z: for each month name, the number of years in which that month has no row.
z <- with(df, {
  counts <- table(months(Date), substr(Date, 1, 4))
  rowSums(counts == 0)
})
# Keep only the rows whose month is observed in every year (z equals 0).
res <- df[months(df$Date) %in% names(which(z == 0)), , drop = FALSE]
head(res)
# Date Sentiment
# 1 2002-01-03 0.0000000
# 3 2002-03-07 0.0000000
# 4 2002-04-04 0.0000000
# 6 2002-06-06 -0.2398622
# 7 2002-07-04 0.0000000
# 9 2002-10-10 0.0000000
检查:
# Cross-tabulate month name by year for the filtered rows.
table(months(res$Date), substr(res$Date, 1, 4))
# 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
# April 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# December 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# January 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# July 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# June 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# March 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# October 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
编辑
要按照下面评论中描述的方式进行插补,可以先用 `flag` 标记待删除的行,将每个保留行与其后紧跟的被标记行取平均值,然后删除被标记的行。
# Replace each kept row's Sentiment with the mean over that row and the
# dropped rows that directly follow it, then discard the dropped rows.
res2 <- local({
  # TRUE for rows whose month is observed in every year (z equals 0).
  kept <- months(df$Date) %in% names(z[z == 0])
  averaged <- df
  # cumsum(kept) starts a new averaging group at every kept row.
  averaged$Sentiment <- ave(df$Sentiment, cumsum(kept), FUN = mean)
  averaged[kept, 1:2]
})
head(res2)
# Date Sentiment
# 1 2002-01-03 0.00000000
# 3 2002-03-07 0.00000000
# 4 2002-04-04 -0.04787648
# 6 2002-06-06 -0.23986217
# 7 2002-07-04 0.00000000
# 9 2002-10-10 -0.09446213