如何使用R中每行的第一个文本值来填充空行



数据

以下是我正在处理的数据子集的dput

crime_state <- structure(list(State = c("ALABAMA", "", "ARIZONA", "", "", "", 
"", "", "", "", "", "", "ARKANSAS", "CALIFORNIA", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
""), City = c("HUNTSVILLE4", "TUSCALOOSA", "CHANDLER", "GILBERT", 
"GLENDALE", "MESA", "PEORIA", "PHOENIX", "SCOTTSDALE", "SURPRISE", 
"TEMPE", "TUCSON", "LITTLE ROCK", "ANAHEIM", "ANTIOCH", "BAKERSFIELD", 
"BERKELEY", "BURBANK", "CARLSBAD", "CHULA VISTA", "CLOVIS", "CONCORD", 
"CORONA", "COSTA MESA", "DALY CITY", "DOWNEY", "EL CAJON", "EL MONTE", 
"ELK GROVE", "ESCONDIDO", "FAIRFIELD", "FONTANA", "FREMONT", 
"FRESNO", "FULLERTON", "GARDEN GROVE", "GLENDALE", "HAYWARD", 
"HUNTINGTON BEACH", "INGLEWOOD", "IRVINE", "JURUPA VALLEY", "LANCASTER", 
"LONG BEACH", "LOS ANGELES", "MODESTO", "MORENO VALLEY", "MURRIETA", 
"NORWALK", "OAKLAND"), X = c(2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L
), Population1 = c("196,620", "101,764", "255,986", "247,463", 
"249,799", "504,873", "170,177", "1,653,080", "254,961", "136,611", 
"188,543", "537,392", "199,288", "354,743", "112,956", "385,609", 
"123,735", "105,041", "116,739", "274,370", "111,759", "130,855", 
"170,041", "114,358", "107,928", "113,277", "104,497", "116,464", 
"174,651", "153,073", "117,883", "213,964", "238,024", "531,818", 
"141,132", "174,661", "204,724", "162,881", "203,428", "110,726", 
"288,052", "107,605", "160,818", "470,445", "4,029,741", "215,822", 
"209,145", "114,706", "106,158", "430,230"), Violent..crime = c("", 
"253", "287", "132", "594", "967", "201", "6,118", "211", "67", 
"448", "2,027", "1,336", "578", "283", "911", "270", "129", "115", 
"406", "106", "246", "127", "170", "113", "174", "246", "181", 
"198", "237", "335", "331", "274", "1,489", "174", "266", "83", 
"330", "218", "349", "82", "134", "586", "1,702", "14,709", "963", 
"396", "49", "219", "2,675"), Murder = c(NA, 3L, 1L, 1L, 7L, 
10L, 2L, 67L, 7L, 3L, 3L, 26L, 19L, 2L, 1L, 18L, 1L, 0L, 1L, 
4L, 1L, 1L, 4L, 3L, 2L, 3L, 2L, 2L, 0L, 1L, 2L, 5L, 1L, 18L, 
0L, 2L, 0L, 0L, 0L, 7L, 0L, 5L, 4L, 11L, 132L, 8L, 7L, 2L, 5L, 
32L), Rape2 = c("", "20", "74", "49", "69", "143", "33", "566", 
"53", "10", "75", "255", "121", "70", "24", "60", "31", "7", 
"19", "43", "23", "27", "27", "35", "18", "7", "20", "12", "16", 
"28", "53", "27", "32", "93", "27", "28", "16", "55", "41", "27", 
"22", "8", "49", "107", "1,324", "47", "17", "5", "4", "214"), 
Robbery = c("", "71", "62", "15", "171", "192", "29", "1,438", 
"52", "20", "97", "670", "150", "195", "102", "402", "167", 
"48", "23", "114", "15", "103", "49", "59", "45", "83", "86", 
"78", "40", "71", "89", "94", "115", "473", "66", "90", "24", 
"170", "59", "156", "22", "45", "142", "527", "5,139", "204", 
"141", "18", "61", "1,220"), Aggravated..assault = c("", 
"159", "150", "67", "347", "622", "137", "4,047", "99", "34", 
"273", "1,076", "1,046", "311", "156", "431", "71", "74", 
"72", "245", "67", "115", "47", "73", "48", "81", "138", 
"89", "142", "137", "191", "205", "126", "905", "81", "146", 
"43", "105", "118", "159", "38", "76", "391", "1,057", "8,114", 
"704", "231", "24", "149", "1,209"), Property..crime = c("", 
"2,092", "2,771", "1,705", "4,830", "5,113", "1,596", "28,854", 
"2,853", "1,088", "3,567", "13,925", "6,236", "4,399", "1,479", 
"8,330", "2,581", "1,362", "1,081", "1,905", "1,271", "2,151", 
"1,535", "1,911", "719", "1,353", "1,164", "1,216", "1,128", 
"1,370", "1,639", "1,683", "2,387", "8,628", "1,878", "2,096", 
"1,491", "2,434", "1,871", "1,382", "1,670", "1,389", "1,855", 
"5,936", "50,093", "3,965", "2,933", "661", "1,057", "10,677"
), Burglary = c("", "414", "369", "214", "756", "751", "281", 
"5,465", "350", "138", "456", "1,690", "1,068", "733", "278", 
"1,997", "403", "141", "169", "307", "160", "261", "191", 
"265", "119", "243", "166", "257", "147", "187", "211", "289", 
"404", "1,411", "176", "329", "188", "262", "227", "221", 
"276", "201", "512", "1,077", "7,869", "564", "591", "128", 
"222", "1,160"), Larceny..theft = c("", "1,569", "2,241", 
"1,409", "3,581", "3,901", "1,224", "19,461", "2,362", "860", 
"2,841", "10,959", "4,643", "2,969", "910", "4,875", "1,915", 
"1,115", "830", "1,244", "1,024", "1,611", "1,082", "1,486", 
"517", "785", "827", "677", "892", "933", "1,121", "928", 
"1,620", "6,126", "1,460", "1,438", "1,156", "1,468", "1,485", 
"822", "1,307", "791", "981", "3,688", "33,364", "2,819", 
"1,828", "435", "604", "7,021"), Motor..vehicle..theft = c("", 
"109", "161", "82", "493", "461", "91", "3,928", "141", "90", 
"270", "1,276", "525", "697", "291", "1,458", "263", "106", 
"82", "354", "87", "279", "262", "160", "83", "325", "171", 
"282", "89", "250", "307", "466", "363", "1,091", "242", 
"329", "147", "704", "159", "339", "87", "397", "362", "1,171", 
"8,860", "582", "514", "98", "231", "2,496"), Arson3 = c(NA, 
NA, 8L, 5L, 34L, 7L, 4L, 140L, 7L, 3L, 3L, 92L, 24L, 13L, 
28L, 116L, 21L, 7L, 5L, 9L, 2L, 12L, 10L, 12L, 3L, 8L, 9L, 
10L, 6L, 5L, 11L, 5L, 6L, 129L, 8L, 6L, 5L, 9L, 9L, 7L, 2L, 
0L, 14L, 57L, 897L, 26L, 5L, 2L, 4L, 106L)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -50L))

问题

我有这样的数据,每个州都有多个县,因此州行有几个空值:

# A tibble: 276 × 14
State     City         X Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³
<chr>     <chr>    <int> <chr>   <chr>    <int> <chr> <chr>   <chr>  
1 "ALABAMA" HUNTSVI…  2018 196,620 ""          NA ""    ""      ""     
2 ""        TUSCALO…  2018 101,764 "253"        3 "20"  "71"    "159"  
3 "ARIZONA" CHANDLER  2018 255,986 "287"        1 "74"  "62"    "150"  
4 ""        GILBERT   2018 247,463 "132"        1 "49"  "15"    "67"   
5 ""        GLENDALE  2018 249,799 "594"        7 "69"  "171"   "347"  
6 ""        MESA      2018 504,873 "967"       10 "143" "192"   "622"  
7 ""        PEORIA    2018 170,177 "201"        2 "33"  "29"    "137"  
8 ""        PHOENIX   2018 1,653,… "6,118"     67 "566" "1,438" "4,047"
9 ""        SCOTTSD…  2018 254,961 "211"        7 "53"  "52"    "99"   
10 ""        SURPRISE  2018 136,611 "67"         3 "10"  "20"    "34"

对于这部分数据,阿拉巴马州应填写前两行,亚利桑那州应填写最后8行空的数据。6年前,我尝试使用这篇文章中的方法,但使用了以下代码(提供as.character,因为这里的大多数数据都是这样的(:

library(tidyverse)
crime_state %>%
mutate_all(as.character) %>%
fill(names(.), 
.direction = "up")

我需要修理的行没有零钱。我的数据有更好的替代方案吗?这将是理想的tibble:

# A tibble: 276 × 14
State     City     X     Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³
<chr>     <chr>    <chr> <chr>   <chr>   <chr>  <chr> <chr>   <chr>  
1 "ALABAMA" HUNTSVI… 2018  196,620 ""      3      ""    ""      ""     
2 "ALABAMA" TUSCALO… 2018  101,764 "253"   3      "20"  "71"    "159"  
3 "ARIZONA" CHANDLER 2018  255,986 "287"   1      "74"  "62"    "150"  
4 "ARIZONA" GILBERT  2018  247,463 "132"   1      "49"  "15"    "67"   
5 "ARIZONA" GLENDALE 2018  249,799 "594"   7      "69"  "171"   "347"  
6 "ARIZONA" MESA     2018  504,873 "967"   10     "143" "192"   "622"  
7 "ARIZONA" PEORIA   2018  170,177 "201"   2      "33"  "29"    "137"  
8 "ARIZONA" PHOENIX  2018  1,653,… "6,118" 67     "566" "1,438" "4,047"
9 "ARIZONA" SCOTTSD… 2018  254,961 "211"   7      "53"  "52"    "99"   
10 "ARIZONA" SURPRISE 2018  136,611 "67"    3      "10"  "20"    "34" 

关于:

[...]
crime_state |>
dplyr::mutate(State = ifelse(State == "", NA, State)) |>
tidyr::fill(State)
#> # A tibble: 50 × 14
#>    State   City           X Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³ Prope…⁴
#>    <chr>   <chr>      <int> <chr>   <chr>    <int> <chr> <chr>   <chr>   <chr>  
#>  1 ALABAMA HUNTSVILL…  2018 196,620 ""          NA ""    ""      ""      ""     
#>  2 ALABAMA TUSCALOOSA  2018 101,764 "253"        3 "20"  "71"    "159"   "2,092"
#>  3 ARIZONA CHANDLER    2018 255,986 "287"        1 "74"  "62"    "150"   "2,771"
#>  4 ARIZONA GILBERT     2018 247,463 "132"        1 "49"  "15"    "67"    "1,705"
#>  5 ARIZONA GLENDALE    2018 249,799 "594"        7 "69"  "171"   "347"   "4,830"
#>  6 ARIZONA MESA        2018 504,873 "967"       10 "143" "192"   "622"   "5,113"
#>  7 ARIZONA PEORIA      2018 170,177 "201"        2 "33"  "29"    "137"   "1,596"
#>  8 ARIZONA PHOENIX     2018 1,653,… "6,118"     67 "566" "1,438" "4,047" "28,85…
#>  9 ARIZONA SCOTTSDALE  2018 254,961 "211"        7 "53"  "52"    "99"    "2,853"
#> 10 ARIZONA SURPRISE    2018 136,611 "67"         3 "10"  "20"    "34"    "1,088"
#> # … with 40 more rows, 4 more variables: Burglary <chr>, Larceny..theft <chr>,
#> #   Motor..vehicle..theft <chr>, Arson3 <int>, and abbreviated variable names
#> #   ¹​Population1, ²​Violent..crime, ³​Aggravated..assault, ⁴​Property..crime

创建于2022-10-22使用reprex v2.0.2

我建议从@Grzegorz中简化一个很棒的方法。dplyr::na_if:就是这样

library(dplyr)
library(tidyr)
library(tidyverse)
crime_state %>%
mutate(State = na_if(State, '')) %>%
fill(State)
A tibble: 50 × 14
State   City            X Population1 Violent..crime Murder Rape2 Robbery Aggravated..assault Property..crime Burglary Larceny..theft Motor..vehicle..t…¹ Arson3
<chr>   <chr>       <int> <chr>       <chr>           <int> <chr> <chr>   <chr>               <chr>           <chr>    <chr>          <chr>                <int>
1 ALABAMA HUNTSVILLE4  2018 196,620     ""                 NA ""    ""      ""                  ""              ""       ""             ""                      NA
2 ALABAMA TUSCALOOSA   2018 101,764     "253"               3 "20"  "71"    "159"               "2,092"         "414"    "1,569"        "109"                   NA
3 ARIZONA CHANDLER     2018 255,986     "287"               1 "74"  "62"    "150"               "2,771"         "369"    "2,241"        "161"                    8
4 ARIZONA GILBERT      2018 247,463     "132"               1 "49"  "15"    "67"                "1,705"         "214"    "1,409"        "82"                     5
5 ARIZONA GLENDALE     2018 249,799     "594"               7 "69"  "171"   "347"               "4,830"         "756"    "3,581"        "493"                   34
6 ARIZONA MESA         2018 504,873     "967"              10 "143" "192"   "622"               "5,113"         "751"    "3,901"        "461"                    7
7 ARIZONA PEORIA       2018 170,177     "201"               2 "33"  "29"    "137"               "1,596"         "281"    "1,224"        "91"                     4
8 ARIZONA PHOENIX      2018 1,653,080   "6,118"            67 "566" "1,438" "4,047"             "28,854"        "5,465"  "19,461"       "3,928"                140
9 ARIZONA SCOTTSDALE   2018 254,961     "211"               7 "53"  "52"    "99"                "2,853"         "350"    "2,362"        "141"                    7
10 ARIZONA SURPRISE     2018 136,611     "67"                3 "10"  "20"    "34"                "1,088"         "138"    "860"          "90"                     3
# … with 40 more rows, and abbreviated variable name ¹​Motor..vehicle..theft
# ℹ Use `print(n = ...)` to see more rows

最新更新