数据
以下是我正在处理的数据子集的dput
:
crime_state <- structure(list(State = c("ALABAMA", "", "ARIZONA", "", "", "",
"", "", "", "", "", "", "ARKANSAS", "CALIFORNIA", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
""), City = c("HUNTSVILLE4", "TUSCALOOSA", "CHANDLER", "GILBERT",
"GLENDALE", "MESA", "PEORIA", "PHOENIX", "SCOTTSDALE", "SURPRISE",
"TEMPE", "TUCSON", "LITTLE ROCK", "ANAHEIM", "ANTIOCH", "BAKERSFIELD",
"BERKELEY", "BURBANK", "CARLSBAD", "CHULA VISTA", "CLOVIS", "CONCORD",
"CORONA", "COSTA MESA", "DALY CITY", "DOWNEY", "EL CAJON", "EL MONTE",
"ELK GROVE", "ESCONDIDO", "FAIRFIELD", "FONTANA", "FREMONT",
"FRESNO", "FULLERTON", "GARDEN GROVE", "GLENDALE", "HAYWARD",
"HUNTINGTON BEACH", "INGLEWOOD", "IRVINE", "JURUPA VALLEY", "LANCASTER",
"LONG BEACH", "LOS ANGELES", "MODESTO", "MORENO VALLEY", "MURRIETA",
"NORWALK", "OAKLAND"), X = c(2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L
), Population1 = c("196,620", "101,764", "255,986", "247,463",
"249,799", "504,873", "170,177", "1,653,080", "254,961", "136,611",
"188,543", "537,392", "199,288", "354,743", "112,956", "385,609",
"123,735", "105,041", "116,739", "274,370", "111,759", "130,855",
"170,041", "114,358", "107,928", "113,277", "104,497", "116,464",
"174,651", "153,073", "117,883", "213,964", "238,024", "531,818",
"141,132", "174,661", "204,724", "162,881", "203,428", "110,726",
"288,052", "107,605", "160,818", "470,445", "4,029,741", "215,822",
"209,145", "114,706", "106,158", "430,230"), Violent..crime = c("",
"253", "287", "132", "594", "967", "201", "6,118", "211", "67",
"448", "2,027", "1,336", "578", "283", "911", "270", "129", "115",
"406", "106", "246", "127", "170", "113", "174", "246", "181",
"198", "237", "335", "331", "274", "1,489", "174", "266", "83",
"330", "218", "349", "82", "134", "586", "1,702", "14,709", "963",
"396", "49", "219", "2,675"), Murder = c(NA, 3L, 1L, 1L, 7L,
10L, 2L, 67L, 7L, 3L, 3L, 26L, 19L, 2L, 1L, 18L, 1L, 0L, 1L,
4L, 1L, 1L, 4L, 3L, 2L, 3L, 2L, 2L, 0L, 1L, 2L, 5L, 1L, 18L,
0L, 2L, 0L, 0L, 0L, 7L, 0L, 5L, 4L, 11L, 132L, 8L, 7L, 2L, 5L,
32L), Rape2 = c("", "20", "74", "49", "69", "143", "33", "566",
"53", "10", "75", "255", "121", "70", "24", "60", "31", "7",
"19", "43", "23", "27", "27", "35", "18", "7", "20", "12", "16",
"28", "53", "27", "32", "93", "27", "28", "16", "55", "41", "27",
"22", "8", "49", "107", "1,324", "47", "17", "5", "4", "214"),
Robbery = c("", "71", "62", "15", "171", "192", "29", "1,438",
"52", "20", "97", "670", "150", "195", "102", "402", "167",
"48", "23", "114", "15", "103", "49", "59", "45", "83", "86",
"78", "40", "71", "89", "94", "115", "473", "66", "90", "24",
"170", "59", "156", "22", "45", "142", "527", "5,139", "204",
"141", "18", "61", "1,220"), Aggravated..assault = c("",
"159", "150", "67", "347", "622", "137", "4,047", "99", "34",
"273", "1,076", "1,046", "311", "156", "431", "71", "74",
"72", "245", "67", "115", "47", "73", "48", "81", "138",
"89", "142", "137", "191", "205", "126", "905", "81", "146",
"43", "105", "118", "159", "38", "76", "391", "1,057", "8,114",
"704", "231", "24", "149", "1,209"), Property..crime = c("",
"2,092", "2,771", "1,705", "4,830", "5,113", "1,596", "28,854",
"2,853", "1,088", "3,567", "13,925", "6,236", "4,399", "1,479",
"8,330", "2,581", "1,362", "1,081", "1,905", "1,271", "2,151",
"1,535", "1,911", "719", "1,353", "1,164", "1,216", "1,128",
"1,370", "1,639", "1,683", "2,387", "8,628", "1,878", "2,096",
"1,491", "2,434", "1,871", "1,382", "1,670", "1,389", "1,855",
"5,936", "50,093", "3,965", "2,933", "661", "1,057", "10,677"
), Burglary = c("", "414", "369", "214", "756", "751", "281",
"5,465", "350", "138", "456", "1,690", "1,068", "733", "278",
"1,997", "403", "141", "169", "307", "160", "261", "191",
"265", "119", "243", "166", "257", "147", "187", "211", "289",
"404", "1,411", "176", "329", "188", "262", "227", "221",
"276", "201", "512", "1,077", "7,869", "564", "591", "128",
"222", "1,160"), Larceny..theft = c("", "1,569", "2,241",
"1,409", "3,581", "3,901", "1,224", "19,461", "2,362", "860",
"2,841", "10,959", "4,643", "2,969", "910", "4,875", "1,915",
"1,115", "830", "1,244", "1,024", "1,611", "1,082", "1,486",
"517", "785", "827", "677", "892", "933", "1,121", "928",
"1,620", "6,126", "1,460", "1,438", "1,156", "1,468", "1,485",
"822", "1,307", "791", "981", "3,688", "33,364", "2,819",
"1,828", "435", "604", "7,021"), Motor..vehicle..theft = c("",
"109", "161", "82", "493", "461", "91", "3,928", "141", "90",
"270", "1,276", "525", "697", "291", "1,458", "263", "106",
"82", "354", "87", "279", "262", "160", "83", "325", "171",
"282", "89", "250", "307", "466", "363", "1,091", "242",
"329", "147", "704", "159", "339", "87", "397", "362", "1,171",
"8,860", "582", "514", "98", "231", "2,496"), Arson3 = c(NA,
NA, 8L, 5L, 34L, 7L, 4L, 140L, 7L, 3L, 3L, 92L, 24L, 13L,
28L, 116L, 21L, 7L, 5L, 9L, 2L, 12L, 10L, 12L, 3L, 8L, 9L,
10L, 6L, 5L, 11L, 5L, 6L, 129L, 8L, 6L, 5L, 9L, 9L, 7L, 2L,
0L, 14L, 57L, 897L, 26L, 5L, 2L, 4L, 106L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -50L))
问题
我有这样的数据,每个州都有多个县,因此州行有几个空值:
# A tibble: 276 × 14
State City X Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³
<chr> <chr> <int> <chr> <chr> <int> <chr> <chr> <chr>
1 "ALABAMA" HUNTSVI… 2018 196,620 "" NA "" "" ""
2 "" TUSCALO… 2018 101,764 "253" 3 "20" "71" "159"
3 "ARIZONA" CHANDLER 2018 255,986 "287" 1 "74" "62" "150"
4 "" GILBERT 2018 247,463 "132" 1 "49" "15" "67"
5 "" GLENDALE 2018 249,799 "594" 7 "69" "171" "347"
6 "" MESA 2018 504,873 "967" 10 "143" "192" "622"
7 "" PEORIA 2018 170,177 "201" 2 "33" "29" "137"
8 "" PHOENIX 2018 1,653,… "6,118" 67 "566" "1,438" "4,047"
9 "" SCOTTSD… 2018 254,961 "211" 7 "53" "52" "99"
10 "" SURPRISE 2018 136,611 "67" 3 "10" "20" "34"
对于这部分数据,阿拉巴马州应填写前两行,亚利桑那州应填写最后8行空的数据。6年前,我尝试使用这篇文章中的方法,但使用了以下代码(提供as.character
,因为这里的大多数数据都是这样的(:
library(tidyverse)
crime_state %>%
mutate_all(as.character) %>%
fill(names(.),
.direction = "up")
我需要修理的行没有零钱。我的数据有更好的替代方案吗?这将是理想的tibble:
# A tibble: 276 × 14
State City X Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 "ALABAMA" HUNTSVI… 2018 196,620 "" 3 "" "" ""
2 "ALABAMA" TUSCALO… 2018 101,764 "253" 3 "20" "71" "159"
3 "ARIZONA" CHANDLER 2018 255,986 "287" 1 "74" "62" "150"
4 "ARIZONA" GILBERT 2018 247,463 "132" 1 "49" "15" "67"
5 "ARIZONA" GLENDALE 2018 249,799 "594" 7 "69" "171" "347"
6 "ARIZONA" MESA 2018 504,873 "967" 10 "143" "192" "622"
7 "ARIZONA" PEORIA 2018 170,177 "201" 2 "33" "29" "137"
8 "ARIZONA" PHOENIX 2018 1,653,… "6,118" 67 "566" "1,438" "4,047"
9 "ARIZONA" SCOTTSD… 2018 254,961 "211" 7 "53" "52" "99"
10 "ARIZONA" SURPRISE 2018 136,611 "67" 3 "10" "20" "34"
关于:
[...]
crime_state |>
dplyr::mutate(State = ifelse(State == "", NA, State)) |>
tidyr::fill(State)
#> # A tibble: 50 × 14
#> State City X Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³ Prope…⁴
#> <chr> <chr> <int> <chr> <chr> <int> <chr> <chr> <chr> <chr>
#> 1 ALABAMA HUNTSVILL… 2018 196,620 "" NA "" "" "" ""
#> 2 ALABAMA TUSCALOOSA 2018 101,764 "253" 3 "20" "71" "159" "2,092"
#> 3 ARIZONA CHANDLER 2018 255,986 "287" 1 "74" "62" "150" "2,771"
#> 4 ARIZONA GILBERT 2018 247,463 "132" 1 "49" "15" "67" "1,705"
#> 5 ARIZONA GLENDALE 2018 249,799 "594" 7 "69" "171" "347" "4,830"
#> 6 ARIZONA MESA 2018 504,873 "967" 10 "143" "192" "622" "5,113"
#> 7 ARIZONA PEORIA 2018 170,177 "201" 2 "33" "29" "137" "1,596"
#> 8 ARIZONA PHOENIX 2018 1,653,… "6,118" 67 "566" "1,438" "4,047" "28,85…
#> 9 ARIZONA SCOTTSDALE 2018 254,961 "211" 7 "53" "52" "99" "2,853"
#> 10 ARIZONA SURPRISE 2018 136,611 "67" 3 "10" "20" "34" "1,088"
#> # … with 40 more rows, 4 more variables: Burglary <chr>, Larceny..theft <chr>,
#> # Motor..vehicle..theft <chr>, Arson3 <int>, and abbreviated variable names
#> # ¹Population1, ²Violent..crime, ³Aggravated..assault, ⁴Property..crime
创建于2022-10-22使用reprex v2.0.2
我建议从@Grzegorz中简化一个很棒的方法。dplyr::na_if
:就是这样
library(dplyr)
library(tidyr)
library(tidyverse)
crime_state %>%
mutate(State = na_if(State, '')) %>%
fill(State)
A tibble: 50 × 14
State City X Population1 Violent..crime Murder Rape2 Robbery Aggravated..assault Property..crime Burglary Larceny..theft Motor..vehicle..t…¹ Arson3
<chr> <chr> <int> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int>
1 ALABAMA HUNTSVILLE4 2018 196,620 "" NA "" "" "" "" "" "" "" NA
2 ALABAMA TUSCALOOSA 2018 101,764 "253" 3 "20" "71" "159" "2,092" "414" "1,569" "109" NA
3 ARIZONA CHANDLER 2018 255,986 "287" 1 "74" "62" "150" "2,771" "369" "2,241" "161" 8
4 ARIZONA GILBERT 2018 247,463 "132" 1 "49" "15" "67" "1,705" "214" "1,409" "82" 5
5 ARIZONA GLENDALE 2018 249,799 "594" 7 "69" "171" "347" "4,830" "756" "3,581" "493" 34
6 ARIZONA MESA 2018 504,873 "967" 10 "143" "192" "622" "5,113" "751" "3,901" "461" 7
7 ARIZONA PEORIA 2018 170,177 "201" 2 "33" "29" "137" "1,596" "281" "1,224" "91" 4
8 ARIZONA PHOENIX 2018 1,653,080 "6,118" 67 "566" "1,438" "4,047" "28,854" "5,465" "19,461" "3,928" 140
9 ARIZONA SCOTTSDALE 2018 254,961 "211" 7 "53" "52" "99" "2,853" "350" "2,362" "141" 7
10 ARIZONA SURPRISE 2018 136,611 "67" 3 "10" "20" "34" "1,088" "138" "860" "90" 3
# … with 40 more rows, and abbreviated variable name ¹Motor..vehicle..theft
# ℹ Use `print(n = ...)` to see more rows