根据行计数的唯一值向r中的数据帧添加一列



我有一个合并表,如下所示:

> dput(data.frame(df))
structure(list(make = c("audi", "audi", "audi", "audi", "bmw", 
"bmw", "toyota", "toyota", "toyota", "honda", "honda", "honda", 
"honda"), model = c("A3", "A3", "A4", "A4", "3 Series", "3 Series", 
"Land Cruiser", "Camry", "Camry", "Accord", "Accord", "civic", 
"civic"), variant = c("1.4L TFSI", "1.6L TFSI", "1.6L", "1.8L TFSI Quattro", 
"320d", "320d", "4.2L VX AT", "2.4L LE MT", "2.4L LE MT", "2.3L VTI AT", 
"2.3L VTI S", "1.8L SPORT", "1.8L V"), from_year = c(2014, 2008, 
2004, 2011, 2012, 2015, 1998, 2001, 2006, 2001, 2001, 2009, 2006
), to_year = c(2020, 2012, 2008, 2016, 2015, 2020, 2003, 2006, 
2011, 2003, 2003, 2012, 2009), id = c(1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13)), class = "data.frame", row.names = c(NA, 
-13L))

数据帧结构:

> df
make        model           variant from_year to_year id
1    audi           A3         1.4L TFSI      2014    2020  1
2    audi           A3         1.6L TFSI      2008    2012  2
3    audi           A4              1.6L      2004    2008  3
4    audi           A4 1.8L TFSI Quattro      2011    2016  4
5     bmw     3 Series              320d      2012    2015  5
6     bmw     3 Series              320d      2015    2020  6
7  toyota Land Cruiser        4.2L VX AT      1998    2003  7
8  toyota        Camry        2.4L LE MT      2001    2006  8
9  toyota        Camry        2.4L LE MT      2006    2011  9
10  honda       Accord       2.3L VTI AT      2001    2003 10
11  honda       Accord        2.3L VTI S      2001    2003 11
12  honda        civic        1.8L SPORT      2009    2012 12
13  honda        civic            1.8L V      2006    2009 13

我需要根据品牌、模型和变体对表进行规范化,每个表都有主键

预期输出:

> dput(df1)
structure(list(make = c("audi", "audi", "audi", "audi", "bmw", 
"bmw", "toyota", "toyota", "toyota", "honda", "honda", "honda", 
"honda"), make_id = c(1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4), 
model = c("A3", "A3", "A4", "A4", "3 Series", "3 Series", 
"Land Cruiser", "Camry", "Camry", "Accord", "Accord", "civic", 
"civic"), vehicle_model_id = c(1, 1, 2, 2, 3, 3, 4, 5, 5, 
6, 6, 7, 7), variant = c("1.4L TFSI", "1.6L TFSI", "1.6L", 
"1.8L TFSI Quattro", "320d", "320d", "4.2L VX AT", "2.4L LE MT", 
"2.4L LE MT", "2.3L VTI AT", "2.3L VTI S", "1.8L SPORT", 
"1.8L V"), from_year = c(2014, 2008, 2004, 2011, 2012, 2015, 
1998, 2001, 2006, 2001, 2001, 2009, 2006), to_year = c(2020, 
2012, 2008, 2016, 2015, 2020, 2003, 2006, 2011, 2003, 2003, 
2012, 2009), id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13)), row.names = c(NA, -13L), class = c("tbl_df", "tbl", 
"data.frame"))
> df1
# A tibble: 13 x 8
make   make_id model        vehicle_model_id variant           from_year to_year    id
<chr>    <dbl> <chr>                   <dbl> <chr>                 <dbl>   <dbl> <dbl>
1 audi         1 A3                          1 1.4L TFSI              2014    2020     1
2 audi         1 A3                          1 1.6L TFSI              2008    2012     2
3 audi         1 A4                          2 1.6L                   2004    2008     3
4 audi         1 A4                          2 1.8L TFSI Quattro      2011    2016     4
5 bmw          2 3 Series                    3 320d                   2012    2015     5
6 bmw          2 3 Series                    3 320d                   2015    2020     6
7 toyota       3 Land Cruiser                4 4.2L VX AT             1998    2003     7
8 toyota       3 Camry                       5 2.4L LE MT             2001    2006     8
9 toyota       3 Camry                       5 2.4L LE MT             2006    2011     9
10 honda        4 Accord                      6 2.3L VTI AT            2001    2003    10
11 honda        4 Accord                      6 2.3L VTI S             2001    2003    11
12 honda        4 civic                       7 1.8L SPORT             2009    2012    12
13 honda        4 civic                       7 1.8L V                 2006    2009    13
> 

有人能告诉我如何根据"id"中字符串的唯一值获得如上所示的新"id"列吗;制造";以及";型号";列。

一旦我得到了上面的输出,我就可以得到如下规范化的表。

> manufacturers
id    make
1     audi
2     bmw
3     toyota
4     honda
> models
id    make_id    model
1     1          A3
2     1          A4
3     2          3 Series
4     3          Land Cruiser
5     3          Camry
6     4          Accord
7     4          civic
> variants
id    name                 vehicle_model_id     from_year      to_year
1     1.4 TFSI             1                    2014           2020
2     1.6 TFSI             1                    2008           2012
3     1.6L                 2                    2004           2008
4     1.8L TFSI Quattro    2                    2011           2016
..
..

以下是使用dplyr:的方法

制造商

library(dplyr)
df %>%
mutate(id = as.numeric(as.factor(make))) %>% 
group_by(make) %>% 
summarise(id = mean(id)) %>% 
select(2:1)
#> # A tibble: 4 x 2
#>      id make  
#>   <dbl> <chr> 
#> 1     1 audi  
#> 2     2 bmw   
#> 3     3 honda 
#> 4     4 toyota

型号

df %>% mutate(make_id = as.numeric(as.factor(make))) %>%
group_by(model) %>% 
summarize(make_id = mean(make_id)) %>%
arrange(make_id) %>%
mutate(id = row_number()) %>%
select(c(3, 1, 2))
#> # A tibble: 7 x 3
#>      id model        make_id
#>   <int> <chr>          <dbl>
#> 1     1 A3                 1
#> 2     2 A4                 1
#> 3     3 3 Series           2
#> 4     4 Accord             3
#> 5     5 civic              3
#> 6     6 Camry              4
#> 7     7 Land Cruiser       4

变体

df %>% mutate(model_id = as.numeric(as.factor(model))) %>%
group_by(variant) %>% 
summarize(vehicle_model_id = mean(model_id),
from_year = from_year,
to_year = to_year) %>%
arrange(vehicle_model_id) %>%
ungroup() %>%
mutate(id = row_number()) %>%
select(5, 1:4)
#> # A tibble: 13 x 5
#>       id variant           vehicle_model_id from_year to_year
#>    <int> <chr>                        <dbl>     <dbl>   <dbl>
#>  1     1 320d                             1      2012    2015
#>  2     2 320d                             1      2015    2020
#>  3     3 1.4L TFSI                        2      2014    2020
#>  4     4 1.6L TFSI                        2      2008    2012
#>  5     5 1.6L                             3      2004    2008
#>  6     6 1.8L TFSI Quattro                3      2011    2016
#>  7     7 2.3L VTI AT                      4      2001    2003
#>  8     8 2.3L VTI S                       4      2001    2003
#>  9     9 2.4L LE MT                       5      2001    2006
#> 10    10 2.4L LE MT                       5      2006    2011
#> 11    11 1.8L SPORT                       6      2009    2012
#> 12    12 1.8L V                           6      2006    2009
#> 13    13 4.2L VX AT                       7      1998    2003

由reprex包(v0.3.0(于2020-07-09创建

library(dplyr)
df <- data.frame(values = c("A", "A", "B", "B", "C", "A"))
df <- df %>% mutate(id = as.numeric(factor(values)))

只需factor大小,然后对字符串进行编号。如果您希望根据数字的外观而不是字母顺序来定义数字,则可以根据unique值来定义levels=

df <- transform(df, 
make_id=as.numeric(factor(make, levels=unique(make))),
vehicle_model_id=as.numeric(factor(model, levels=unique(model))))
df
#      make        model           variant from_year to_year id make_id vehicle_model_id
# 1    audi           A3         1.4L TFSI      2014    2020  1       1                1
# 2    audi           A3         1.6L TFSI      2008    2012  2       1                1
# 3    audi           A4              1.6L      2004    2008  3       1                2
# 4    audi           A4 1.8L TFSI Quattro      2011    2016  4       1                2
# 5     bmw     3 Series              320d      2012    2015  5       2                3
# 6     bmw     3 Series              320d      2015    2020  6       2                3
# 7  toyota Land Cruiser        4.2L VX AT      1998    2003  7       3                4
# 8  toyota        Camry        2.4L LE MT      2001    2006  8       3                5
# 9  toyota        Camry        2.4L LE MT      2006    2011  9       3                5
# 10  honda       Accord       2.3L VTI AT      2001    2003 10       4                6
# 11  honda       Accord        2.3L VTI S      2001    2003 11       4                6
# 12  honda        civic        1.8L SPORT      2009    2012 12       4                7
# 13  honda        civic            1.8L V      2006    2009 13       4                7

相关内容

  • 没有找到相关文章

最新更新