r语言 - 组合来自两个数据框架的信息



我有两个数据表,一个是不同地点的采样时间点列表,另一个是该地区农场的列表,其中包含每个农场开放、关闭的日期,以及农场附近的采样区域。

我想为采样时间点数据框创建一个新列,该列返回采样时在该位置附近运行的农场数量。我可以通过过滤农场数据框架来单独完成这个任务,当采集样本并位于样本区域时,这些农场正在运行,但是我想要一种自动化的方法。我认为我需要的是建立一个函数,将运行每一行采样时间点数据框通过当前的代码,但它需要从两个数据框的输入,我不知道该怎么做。任何帮助都是感激的!

目前我如何做到这一点一个接一个手动插入"样本日期"one_answers"样本位置"

farms %>%
filter(as.Date(Open.Date) < as.Date("Sample.Date")) %>%
filter(as.Date(Closure.Date) > as.Date("Sample.Date")) %>%
filter(grepl("Sample.Location", Nearby.Farm.Locations)) %>%
count()

——添加——

采样数据帧的示例(只是一个小子集)

structure(list(Sample.Date = structure(c(2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
1L), .Label = c("29/06/2004", "29/06/2015"), class = "factor"), 
Location = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 
3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L), .Label = c("Orchard Bay ", 
"Port Ligar ", "Rams Head ", "Skiddaw ", "Te Puraka Point ", 
"Waitata Reach "), class = "factor")), class = "data.frame", row.names = c(NA, 
-22L))

和一个农场数据框架的例子

structure(list(Farm.Number = c(8103L, 8107L, 8108L, 8109L, 8110L, 
8111L, 8112L, 8113L, 8114L, 8115L, 8116L, 8117L, 8118L, 8119L, 
8120L, 8121L, 8122L, 8123L, 8124L, 8125L, 8126L, 8127L, 8128L, 
8129L, 8130L, 8131L, 8132L, 8133L), Start.Date = structure(c(23L, 
16L, 4L, 7L, 25L, 14L, 10L, 20L, 2L, 12L, 3L, 8L, 14L, 11L, 9L, 
1L, 19L, 22L, 5L, 18L, 17L, 13L, 24L, 15L, 26L, 15L, 21L, 6L), .Label = c("1/07/1982", 
"10/06/1994", "11/04/1990", "11/04/1997", "13/05/1982", "13/08/1980", 
"14/05/1996", "14/09/1983", "15/09/1982", "16/03/1981", "17/03/1997", 
"21/01/1986", "23/05/1989", "23/07/2004", "23/12/1982", "25/08/2000", 
"27/06/1983", "27/10/1981", "28/09/1982", "28/10/1983", "29/01/1981", 
"29/01/1982", "29/09/1997", "30/01/2001", "30/06/1982", "4/08/1980"
), class = "factor"), End.Date = structure(c(6L, 8L, 13L, 10L, 
10L, 3L, 10L, 9L, 7L, 10L, 1L, 10L, 4L, 12L, 10L, 10L, 10L, 5L, 
10L, 10L, 10L, 10L, 11L, 2L, 10L, 10L, 10L, 10L), .Label = c("1/02/2039", 
"1/06/2039", "1/06/2041", "1/07/2041", "1/10/2040", "14/09/2007", 
"20/04/2028", "24/08/2020", "30/06/2034", "31/12/2024", "7/03/2039", 
"7/11/2030", "7/11/2031"), class = "factor"), Nearby.Locations = structure(c(6L, 
5L, 5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 4L, 4L), .Label = c("", "Anakoha Bay", 
"Forsyth Bay; Orchard Bay", "Forsyth Bay; Orchard Bay; Anakoha Bay", 
"Port Ligar; Forsyth Bay; Orchard Bay ", "Waitata Reach "), class = "factor")), class = "data.frame", row.names = c(NA, 
-28L))

然后是一个基于上面两个数据框的示例(根据采样时该地区有多少农场在活动,在采样数据框中添加一列):

structure(list(Sample.Date = structure(c(2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
1L), .Label = c("29/06/2004", "29/06/2015"), class = "factor"), 
Location = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 
3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L), .Label = c("Orchard Bay ", 
"Port Ligar ", "Rams Head ", "Skiddaw ", "Te Puraka Point ", 
"Waitata Reach "), class = "factor"), Nearby.Farms = c(16L, 
16L, 16L, 3L, 3L, 3L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA, 
-22L))
library(tidyverse)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
samplings <- structure(list(
Sample.Date = structure(c(
2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L
), .Label = c("29/06/2004", "29/06/2015"), class = "factor"),
Location = structure(c(
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L
), .Label = c(
"Orchard Bay ",
"Port Ligar ", "Rams Head ", "Skiddaw ", "Te Puraka Point ",
"Waitata Reach "
), class = "factor")
), class = "data.frame", row.names = c(
NA,
-22L
))
farms <- structure(list(Farm.Number = c(
8103L, 8107L, 8108L, 8109L, 8110L,
8111L, 8112L, 8113L, 8114L, 8115L, 8116L, 8117L, 8118L, 8119L,
8120L, 8121L, 8122L, 8123L, 8124L, 8125L, 8126L, 8127L, 8128L,
8129L, 8130L, 8131L, 8132L, 8133L
), Start.Date = structure(c(
23L,
16L, 4L, 7L, 25L, 14L, 10L, 20L, 2L, 12L, 3L, 8L, 14L, 11L, 9L,
1L, 19L, 22L, 5L, 18L, 17L, 13L, 24L, 15L, 26L, 15L, 21L, 6L
), .Label = c(
"1/07/1982",
"10/06/1994", "11/04/1990", "11/04/1997", "13/05/1982", "13/08/1980",
"14/05/1996", "14/09/1983", "15/09/1982", "16/03/1981", "17/03/1997",
"21/01/1986", "23/05/1989", "23/07/2004", "23/12/1982", "25/08/2000",
"27/06/1983", "27/10/1981", "28/09/1982", "28/10/1983", "29/01/1981",
"29/01/1982", "29/09/1997", "30/01/2001", "30/06/1982", "4/08/1980"
), class = "factor"), End.Date = structure(c(
6L, 8L, 13L, 10L,
10L, 3L, 10L, 9L, 7L, 10L, 1L, 10L, 4L, 12L, 10L, 10L, 10L, 5L,
10L, 10L, 10L, 10L, 11L, 2L, 10L, 10L, 10L, 10L
), .Label = c(
"1/02/2039",
"1/06/2039", "1/06/2041", "1/07/2041", "1/10/2040", "14/09/2007",
"20/04/2028", "24/08/2020", "30/06/2034", "31/12/2024", "7/03/2039",
"7/11/2030", "7/11/2031"
), class = "factor"), Nearby.Locations = structure(c(
6L,
5L, 5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 4L, 4L
), .Label = c(
"", "Anakoha Bay",
"Forsyth Bay; Orchard Bay", "Forsyth Bay; Orchard Bay; Anakoha Bay",
"Port Ligar; Forsyth Bay; Orchard Bay ", "Waitata Reach "
), class = "factor")), class = "data.frame", row.names = c(
NA,
-28L
))
samplings <-
samplings %>%
# clen up trailing whitespaces
mutate(Location = Location %>% str_trim()) %>%
as_tibble() %>%
# parse date
mutate(Sample.Date = Sample.Date %>% as.character() %>% parse_date("%d/%m/%Y"))
samplings
#> # A tibble: 22 x 2
#>    Sample.Date Location   
#>    <date>      <chr>      
#>  1 2015-06-29  Orchard Bay
#>  2 2015-06-29  Orchard Bay
#>  3 2015-06-29  Orchard Bay
#>  4 2015-06-29  Port Ligar 
#>  5 2015-06-29  Port Ligar 
#>  6 2015-06-29  Port Ligar 
#>  7 2015-06-29  Port Ligar 
#>  8 2015-06-29  Rams Head  
#>  9 2015-06-29  Rams Head  
#> 10 2015-06-29  Rams Head  
#> # … with 12 more rows
farms <-
farms %>%
# normalize farms to have one location per row
mutate(
Nearby.Locations = Nearby.Locations %>% map(~ .x %>%
str_split("; ") %>%
simplify() %>%
map_chr(str_trim))
) %>%
unnest(Nearby.Locations) %>%
rename(Location = Nearby.Locations) %>%
as_tibble() %>%
# Parse date columns
mutate_at(c("End.Date", "Start.Date"), ~ .x %>%
as.character() %>%
parse_date("%d/%m/%Y"))
farms
#> # A tibble: 53 x 4
#>    Farm.Number Start.Date End.Date   Location     
#>          <int> <date>     <date>     <chr>        
#>  1        8103 1997-09-29 2007-09-14 Waitata Reach
#>  2        8107 2000-08-25 2020-08-24 Port Ligar   
#>  3        8107 2000-08-25 2020-08-24 Forsyth Bay  
#>  4        8107 2000-08-25 2020-08-24 Orchard Bay  
#>  5        8108 1997-04-11 2031-11-07 Port Ligar   
#>  6        8108 1997-04-11 2031-11-07 Forsyth Bay  
#>  7        8108 1997-04-11 2031-11-07 Orchard Bay  
#>  8        8109 1996-05-14 2024-12-31 Port Ligar   
#>  9        8109 1996-05-14 2024-12-31 Forsyth Bay  
#> 10        8109 1996-05-14 2024-12-31 Orchard Bay  
#> # … with 43 more rows
samplings %>%
left_join(farms) %>%
filter(Sample.Date %within% interval(Start.Date, End.Date)) %>%
# count the number of farms regardless of how many samples there are
distinct(Sample.Date, Location, Farm.Number) %>%
count(Sample.Date, Location)
#> Joining, by = "Location"
#> # A tibble: 3 x 3
#>   Sample.Date Location          n
#>   <date>      <chr>         <int>
#> 1 2004-06-29  Waitata Reach     1
#> 2 2015-06-29  Orchard Bay      16
#> 3 2015-06-29  Port Ligar        3

由reprex包(v2.0.1)于2021-09-10创建

最新更新