我刚开始使用 Stack Overflow,如果有更好的方式在问题中提供数据,或者我的提问在格式上有其他问题,请告诉我。谢谢!
我有 2 个数据框(data frame)。其中一个(df)对同一个参考号(Reference)有多行记录;另一个(df2)每个唯一参考号只有一行,包含我需要的数据。
我需要将数据框 df 中的 pH 和 DissolvedO2 合并到包含纬度和经度的数据框 df2 中。但我只想要每个唯一参考号最后一行的值,换句话说,就是最深处的 pH 值和溶解氧(DissolvedO2)值。最终的数据框中每个参考号只出现一次。两个数据框的样本可以用以下代码创建(也许有更简单的方法把数据贴到 Stack Overflow 上?):
# Build `df`: the water-quality profile table. Each Reference appears on
# several consecutive lines, one line per sampled depth.
#
# The raw text is read through an explicit connection object so that it can be
# closed afterwards; a textConnection() is created already open, so
# readLines() does not close it and R would later warn about an unused
# connection.
sample.con <- textConnection("BBM2008050101 0.2 B 24.8 52.1 8.2 34.3 6.1
BBM2008050101 1.0 B 24.8 52.4 8.2 34.5 6.1
BBM2008050101 1.4 B 24.8 52.4 8.2 34.5 6.1
BBM2008050102 0.2 B 24.5 53.0 8.1 35.0 6.3
BBM2008050102 1.0 B 24.5 53.0 8.1 34.9 6.0
BBM2008050102 1.6 B 24.5 53.0 8.1 35.0 5.9
BBM2008050103 0.2 B 24.9 51.1 8.2 33.5 6.1
BBM2008050103 1.0 B 24.9 51.1 8.2 33.5 6.1
BBM2008050103 1.6 B 24.9 51.1 8.2 33.5 6.1
BBM2008050104 0.2 B 25.1 51.4 8.2 33.8 6.7
BBM2008050104 1.0 B 25.1 51.4 8.2 33.8 6.5
BBM2008050104 1.6 B 25.1 51.4 8.2 33.8 6.5
BBM2008050105 0.2 B 24.9 51.9 8.1 34.1 7.7
BBM2008050105 1.0 B 24.9 51.9 8.2 34.1 7.9
BBM2008050106 0.2 B 25.4 51.1 8.3 33.5 7.0
BBM2008050106 1.0 B 25.4 51.1 8.3 33.5 6.5
BBM2008050106 2.0 B 25.4 51.1 8.3 33.5 6.5
BBM2008050106 2.3 B 25.4 51.1 8.3 33.5 6.4 ")
sample.df <- readLines(sample.con)
close(sample.con)

# Split every line into whitespace-separated fields.
sample.df <- strsplit(sample.df, "[[:space:]]+")

# Pad short lines with NA so that all rows have the same number of fields
# before they are stacked into a matrix. lengths() always returns an integer
# vector, unlike sapply(), whose return type depends on its input.
max.len <- max(lengths(sample.df))
corrected.list <- lapply(sample.df, function(x) {
  c(x, rep(NA, max.len - length(x)))
})
df <- do.call(rbind, corrected.list)
colnames(df) <- c("Reference", "Depth", "Beg_end", "Temperature", "Conductivity", "pH", "Salinity", "DissolvedO2")

# The matrix is all character; convert to a data frame and let type.convert()
# turn the measurement columns into numbers. `as.is = TRUE` keeps text columns
# (Reference, Beg_end) as character instead of factor.
df <- as.data.frame(df, stringsAsFactors = FALSE)
df <- type.convert(df, as.is = TRUE)
# Build `df2`: the station table, one line per Reference, holding gear,
# longitude/latitude, start depth and zone.
#
# The raw text is read through an explicit connection object so that it can be
# closed afterwards; a textConnection() is created already open, so
# readLines() does not close it and R would later warn about an unused
# connection.
sample.con2 <- textConnection("BBM2008050101 301 -83.44165 29.637633 1.6 D
BBM2008050102 301 -83.439717 29.630233 1.8 D
BBM2008050103 301 -83.434017 29.605567 1.8 D
BBM2008050104 301 -83.440067 29.596267 1.8 D
BBM2008050105 301 -83.4346 29.592667 1.2 D
BBM2008050106 300 -83.44555 29.596917 2.5 D")
sample.df2 <- readLines(sample.con2)
close(sample.con2)

# Split every line into whitespace-separated fields.
sample.df2 <- strsplit(sample.df2, "[[:space:]]+")

# Pad short lines with NA so that all rows have the same number of fields
# before they are stacked into a matrix. lengths() always returns an integer
# vector, unlike sapply(), whose return type depends on its input.
max.len2 <- max(lengths(sample.df2))
corrected.list2 <- lapply(sample.df2, function(x) {
  c(x, rep(NA, max.len2 - length(x)))
})
df2 <- do.call(rbind, corrected.list2)
colnames(df2) <- c("Reference", "Gear", "Longitude", "Latitude", "StartDepth", "Zone")

# The matrix is all character; convert to a data frame and let type.convert()
# turn the numeric columns into numbers. `as.is = TRUE` keeps text columns
# (Reference, Zone) as character instead of factor.
df2 <- as.data.frame(df2, stringsAsFactors = FALSE)
df2 <- type.convert(df2, as.is = TRUE)
期望的输出是 df3(由下面的 sample.df3 构建),即在 df2 的基础上添加最深处的 pH 和 Dissolved02 两列。结果应如下所示,但我的实际数据框显然要大得多,无法手动完成。
# Build `df3`: the desired result, i.e. the station columns plus the pH and
# dissolved-oxygen values taken from the last profile line of each Reference.
#
# The raw text is read through an explicit connection object so that it can be
# closed afterwards; a textConnection() is created already open, so
# readLines() does not close it and R would later warn about an unused
# connection.
sample.con3 <- textConnection("BBM2008050101 301 -83.44165 29.637633 1.6 D 8.2 6.1
BBM2008050102 301 -83.439717 29.630233 1.8 D 8.1 5.9
BBM2008050103 301 -83.434017 29.605567 1.8 D 8.2 6.1
BBM2008050104 301 -83.440067 29.596267 1.8 D 8.2 6.5
BBM2008050105 301 -83.4346 29.592667 1.2 D 8.2 7.9
BBM2008050106 300 -83.44555 29.596917 2.5 D 8.3 6.4")
sample.df3 <- readLines(sample.con3)
close(sample.con3)

# Split every line into whitespace-separated fields.
sample.df3 <- strsplit(sample.df3, "[[:space:]]+")

# Pad short lines with NA so that all rows have the same number of fields
# before they are stacked into a matrix. lengths() always returns an integer
# vector, unlike sapply(), whose return type depends on its input.
max.len3 <- max(lengths(sample.df3))
corrected.list3 <- lapply(sample.df3, function(x) {
  c(x, rep(NA, max.len3 - length(x)))
})
df3 <- do.call(rbind, corrected.list3)

# NOTE: the last column name is spelled with the digit zero ("Dissolved02"),
# whereas the profile table's column is "DissolvedO2" with a capital letter O.
# The spelling is kept unchanged here.
colnames(df3) <- c("Reference", "Gear", "Longitude", "Latitude", "StartDepth", "Zone", "pH", "Dissolved02")

# The matrix is all character; convert to a data frame and let type.convert()
# turn the numeric columns into numbers. `as.is = TRUE` keeps text columns
# (Reference, Zone) as character instead of factor.
df3 <- as.data.frame(df3, stringsAsFactors = FALSE)
df3 <- type.convert(df3, as.is = TRUE)
下面使用 dplyr 的 group_by 和 summarise 取得 df 中每个参考号(Reference)最后一行的记录,然后按 Reference 与 df2 的所有列合并:
library(dplyr)
# For every Reference, keep the pH and DissolvedO2 values found on its last
# row in `df`. This relies on the rows of each Reference being stored in
# increasing depth order, so the last row is the deepest reading.
# summarise() returns one row per Reference, and only the two measurement
# columns are carried over, so no helper row-id column is needed and the
# input `df` is left unmodified.
df1_last_references <- df %>%
  group_by(Reference) %>%
  summarise(pH = last(pH), DissolvedO2 = last(DissolvedO2))

# Attach the two values to the station table. `df2` is the left-hand table
# and all.x = TRUE keeps every station, so a Reference without profile rows
# gets NA. The result has the df2 columns followed by pH and DissolvedO2.
df3 <- merge(df2, df1_last_references, by = "Reference", all.x = TRUE)
head(df3)
Reference Gear Longitude Latitude StartDepth Zone pH Dissolved02
1 BBM2008050101 301 -83.44165 29.637633 1.6 D 8.2 6.1
2 BBM2008050102 301 -83.439717 29.630233 1.8 D 8.1 5.9
3 BBM2008050103 301 -83.434017 29.605567 1.8 D 8.2 6.1
4 BBM2008050104 301 -83.440067 29.596267 1.8 D 8.2 6.5
5 BBM2008050105 301 -83.4346 29.592667 1.2 D 8.2 7.9
6 BBM2008050106 300 -83.44555 29.596917 2.5 D 8.3 6.4
使用 data.table 的另一种方案:
# Update join: for each row of DT2, look up the rows of DT1 with the same
# Reference, keep only the last matching row (mult = "last"), and take its pH
# and DissolvedO2. The two values are then added to DT2 by reference as the
# columns "pH" and "Dissolved02".
DT2[, c("pH", "Dissolved02") := DT1[
  DT2,
  list(pH, DissolvedO2),
  on = "Reference",
  mult = "last"
]]
输出(DT2):
Reference Gear Longitude Latitude StartDepth Zone pH Dissolved02
1: BBM2008050101 301 -83.44165 29.63763 1.6 D 8.2 6.1
2: BBM2008050102 301 -83.43972 29.63023 1.8 D 8.1 5.9
3: BBM2008050103 301 -83.43402 29.60557 1.8 D 8.2 6.1
4: BBM2008050104 301 -83.44007 29.59627 1.8 D 8.2 6.5
5: BBM2008050105 301 -83.43460 29.59267 1.2 D 8.2 7.9
6: BBM2008050106 300 -83.44555 29.59692 2.5 D 8.3 6.4
数据:
library(data.table)
# Profile readings as a data.table, parsed from inline text. The first line
# of the text is the header; each Reference has several rows, one per depth.
DT1 <- data.table::fread(input = "Reference Depth Beg_end Temperature Conductivity pH Salinity DissolvedO2
BBM2008050101 0.2 B 24.8 52.1 8.2 34.3 6.1
BBM2008050101 1.0 B 24.8 52.4 8.2 34.5 6.1
BBM2008050101 1.4 B 24.8 52.4 8.2 34.5 6.1
BBM2008050102 0.2 B 24.5 53.0 8.1 35.0 6.3
BBM2008050102 1.0 B 24.5 53.0 8.1 34.9 6.0
BBM2008050102 1.6 B 24.5 53.0 8.1 35.0 5.9
BBM2008050103 0.2 B 24.9 51.1 8.2 33.5 6.1
BBM2008050103 1.0 B 24.9 51.1 8.2 33.5 6.1
BBM2008050103 1.6 B 24.9 51.1 8.2 33.5 6.1
BBM2008050104 0.2 B 25.1 51.4 8.2 33.8 6.7
BBM2008050104 1.0 B 25.1 51.4 8.2 33.8 6.5
BBM2008050104 1.6 B 25.1 51.4 8.2 33.8 6.5
BBM2008050105 0.2 B 24.9 51.9 8.1 34.1 7.7
BBM2008050105 1.0 B 24.9 51.9 8.2 34.1 7.9
BBM2008050106 0.2 B 25.4 51.1 8.3 33.5 7.0
BBM2008050106 1.0 B 25.4 51.1 8.3 33.5 6.5
BBM2008050106 2.0 B 25.4 51.1 8.3 33.5 6.5
BBM2008050106 2.3 B 25.4 51.1 8.3 33.5 6.4")
# Station table as a data.table, parsed from inline text. The first line of
# the text is the header; each Reference appears on exactly one row.
DT2 <- data.table::fread(input = "Reference Gear Longitude Latitude StartDepth Zone
BBM2008050101 301 -83.44165 29.637633 1.6 D
BBM2008050102 301 -83.439717 29.630233 1.8 D
BBM2008050103 301 -83.434017 29.605567 1.8 D
BBM2008050104 301 -83.440067 29.596267 1.8 D
BBM2008050105 301 -83.4346 29.592667 1.2 D
BBM2008050106 300 -83.44555 29.596917 2.5 D")