我有一个合并表,如下所示:
> dput(data.frame(df))
structure(list(make = c("audi", "audi", "audi", "audi", "bmw",
"bmw", "toyota", "toyota", "toyota", "honda", "honda", "honda",
"honda"), model = c("A3", "A3", "A4", "A4", "3 Series", "3 Series",
"Land Cruiser", "Camry", "Camry", "Accord", "Accord", "civic",
"civic"), variant = c("1.4L TFSI", "1.6L TFSI", "1.6L", "1.8L TFSI Quattro",
"320d", "320d", "4.2L VX AT", "2.4L LE MT", "2.4L LE MT", "2.3L VTI AT",
"2.3L VTI S", "1.8L SPORT", "1.8L V"), from_year = c(2014, 2008,
2004, 2011, 2012, 2015, 1998, 2001, 2006, 2001, 2001, 2009, 2006
), to_year = c(2020, 2012, 2008, 2016, 2015, 2020, 2003, 2006,
2011, 2003, 2003, 2012, 2009), id = c(1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13)), class = "data.frame", row.names = c(NA,
-13L))
数据帧结构:
> df
make model variant from_year to_year id
1 audi A3 1.4L TFSI 2014 2020 1
2 audi A3 1.6L TFSI 2008 2012 2
3 audi A4 1.6L 2004 2008 3
4 audi A4 1.8L TFSI Quattro 2011 2016 4
5 bmw 3 Series 320d 2012 2015 5
6 bmw 3 Series 320d 2015 2020 6
7 toyota Land Cruiser 4.2L VX AT 1998 2003 7
8 toyota Camry 2.4L LE MT 2001 2006 8
9 toyota Camry 2.4L LE MT 2006 2011 9
10 honda Accord 2.3L VTI AT 2001 2003 10
11 honda Accord 2.3L VTI S 2001 2003 11
12 honda civic 1.8L SPORT 2009 2012 12
13 honda civic 1.8L V 2006 2009 13
我需要根据品牌、模型和变体对表进行规范化,每个表都有主键
预期输出:
> dput(df1)
structure(list(make = c("audi", "audi", "audi", "audi", "bmw",
"bmw", "toyota", "toyota", "toyota", "honda", "honda", "honda",
"honda"), make_id = c(1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4),
model = c("A3", "A3", "A4", "A4", "3 Series", "3 Series",
"Land Cruiser", "Camry", "Camry", "Accord", "Accord", "civic",
"civic"), vehicle_model_id = c(1, 1, 2, 2, 3, 3, 4, 5, 5,
6, 6, 7, 7), variant = c("1.4L TFSI", "1.6L TFSI", "1.6L",
"1.8L TFSI Quattro", "320d", "320d", "4.2L VX AT", "2.4L LE MT",
"2.4L LE MT", "2.3L VTI AT", "2.3L VTI S", "1.8L SPORT",
"1.8L V"), from_year = c(2014, 2008, 2004, 2011, 2012, 2015,
1998, 2001, 2006, 2001, 2001, 2009, 2006), to_year = c(2020,
2012, 2008, 2016, 2015, 2020, 2003, 2006, 2011, 2003, 2003,
2012, 2009), id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13)), row.names = c(NA, -13L), class = c("tbl_df", "tbl",
"data.frame"))
> df1
# A tibble: 13 x 8
make make_id model vehicle_model_id variant from_year to_year id
<chr> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
1 audi 1 A3 1 1.4L TFSI 2014 2020 1
2 audi 1 A3 1 1.6L TFSI 2008 2012 2
3 audi 1 A4 2 1.6L 2004 2008 3
4 audi 1 A4 2 1.8L TFSI Quattro 2011 2016 4
5 bmw 2 3 Series 3 320d 2012 2015 5
6 bmw 2 3 Series 3 320d 2015 2020 6
7 toyota 3 Land Cruiser 4 4.2L VX AT 1998 2003 7
8 toyota 3 Camry 5 2.4L LE MT 2001 2006 8
9 toyota 3 Camry 5 2.4L LE MT 2006 2011 9
10 honda 4 Accord 6 2.3L VTI AT 2001 2003 10
11 honda 4 Accord 6 2.3L VTI S 2001 2003 11
12 honda 4 civic 7 1.8L SPORT 2009 2012 12
13 honda 4 civic 7 1.8L V 2006 2009 13
>
有人能告诉我如何根据"id"中字符串的唯一值获得如上所示的新"id"列吗;制造";以及";型号";列。
一旦我得到了上面的输出,我就可以得到如下规范化的表。
> manufacturers
id make
1 audi
2 bmw
3 toyota
4 honda
> models
id make_id model
1 1 A3
2 1 A4
3 2 3 Series
4 3 Land Cruiser
5 3 Camry
6 4 Accord
7 4 civic
> variants
id name vehicle_model_id from_year to_year
1 1.4 TFSI 1 2014 2020
2 1.6 TFSI 1 2008 2012
3 1.6L 2 2004 2008
4 1.8L TFSI Quattro 2 2011 2016
..
..
以下是使用dplyr:的方法
制造商
library(dplyr)
df %>%
mutate(id = as.numeric(as.factor(make))) %>%
group_by(make) %>%
summarise(id = mean(id)) %>%
select(2:1)
#> # A tibble: 4 x 2
#> id make
#> <dbl> <chr>
#> 1 1 audi
#> 2 2 bmw
#> 3 3 honda
#> 4 4 toyota
型号
df %>% mutate(make_id = as.numeric(as.factor(make))) %>%
group_by(model) %>%
summarize(make_id = mean(make_id)) %>%
arrange(make_id) %>%
mutate(id = row_number()) %>%
select(c(3, 1, 2))
#> # A tibble: 7 x 3
#> id model make_id
#> <int> <chr> <dbl>
#> 1 1 A3 1
#> 2 2 A4 1
#> 3 3 3 Series 2
#> 4 4 Accord 3
#> 5 5 civic 3
#> 6 6 Camry 4
#> 7 7 Land Cruiser 4
变体
df %>% mutate(model_id = as.numeric(as.factor(model))) %>%
group_by(variant) %>%
summarize(vehicle_model_id = mean(model_id),
from_year = from_year,
to_year = to_year) %>%
arrange(vehicle_model_id) %>%
ungroup() %>%
mutate(id = row_number()) %>%
select(5, 1:4)
#> # A tibble: 13 x 5
#> id variant vehicle_model_id from_year to_year
#> <int> <chr> <dbl> <dbl> <dbl>
#> 1 1 320d 1 2012 2015
#> 2 2 320d 1 2015 2020
#> 3 3 1.4L TFSI 2 2014 2020
#> 4 4 1.6L TFSI 2 2008 2012
#> 5 5 1.6L 3 2004 2008
#> 6 6 1.8L TFSI Quattro 3 2011 2016
#> 7 7 2.3L VTI AT 4 2001 2003
#> 8 8 2.3L VTI S 4 2001 2003
#> 9 9 2.4L LE MT 5 2001 2006
#> 10 10 2.4L LE MT 5 2006 2011
#> 11 11 1.8L SPORT 6 2009 2012
#> 12 12 1.8L V 6 2006 2009
#> 13 13 4.2L VX AT 7 1998 2003
由reprex包(v0.3.0(于2020-07-09创建
library(dplyr)
df <- data.frame(values = c("A", "A", "B", "B", "C", "A"))
df <- df %>% mutate(id = as.numeric(factor(values)))
只需factor
大小,然后对字符串进行编号。如果您希望根据数字的外观而不是字母顺序来定义数字,则可以根据unique
值来定义levels=
。
df <- transform(df,
make_id=as.numeric(factor(make, levels=unique(make))),
vehicle_model_id=as.numeric(factor(model, levels=unique(model))))
df
# make model variant from_year to_year id make_id vehicle_model_id
# 1 audi A3 1.4L TFSI 2014 2020 1 1 1
# 2 audi A3 1.6L TFSI 2008 2012 2 1 1
# 3 audi A4 1.6L 2004 2008 3 1 2
# 4 audi A4 1.8L TFSI Quattro 2011 2016 4 1 2
# 5 bmw 3 Series 320d 2012 2015 5 2 3
# 6 bmw 3 Series 320d 2015 2020 6 2 3
# 7 toyota Land Cruiser 4.2L VX AT 1998 2003 7 3 4
# 8 toyota Camry 2.4L LE MT 2001 2006 8 3 5
# 9 toyota Camry 2.4L LE MT 2006 2011 9 3 5
# 10 honda Accord 2.3L VTI AT 2001 2003 10 4 6
# 11 honda Accord 2.3L VTI S 2001 2003 11 4 6
# 12 honda civic 1.8L SPORT 2009 2012 12 4 7
# 13 honda civic 1.8L V 2006 2009 13 4 7