R 语言 - 读取 CSV 文件直到唯一的标记行为止



我有许多数据集,它们在某一行之后还包含额外的信息。这些文件都是 CSV 格式。我可以循环遍历它们,并在 read.csv 中使用 "skip" 参数来清理数据顶部的多余内容,但各个数据框的长度都不相同。唯一的共同点是 "Total" 列中的 "--------------- ---------------- ------ -----" 行,该行把有意义的数据与其下方的摘要及无关信息分隔开。

下面是我在不使用 skip = 14 的情况下读取数据的方式(skip = 14 对所有文件都适用):

# Read the raw export as-is: no header row, no skipped lines, and the
# eleven column names supplied explicitly.
before <- read.csv(
  "Example.csv",
  header = FALSE,
  col.names = c(
    "CountryID", "Name", "Type", "Symbol", "Code", "Unit",
    "Total", "Measurement", "Value", "Percent", "CO2"
  )
)

不过,----- 标记在各个文件中可能位于不同的行,但它总是最先出现的此类标记行。以下是处理之前的数据:

structure(list(CountryID = structure(c(26L, 19L, 21L, 23L, 21L, 
7L, 1L, 1L, 1L, 22L, 3L, 1L, 19L, 2L, 8L, 14L, 15L, 13L, 9L, 
12L, 18L, 17L, 8L, 13L, 15L, 10L, 8L, 8L, 11L, 16L, 1L, 1L, 1L, 
20L, 4L, 6L, 1L, 25L, 5L, 1L, 1L, 1L, 24L, 1L), .Label = c("", 
"------------", "-------------", "---------------", "------------------", 
" ", "08.15.1997", "10000", "15000", "200", "2000", "2500", "3000", 
"45000", "5000", "7000", "8000", "8300", "Country", "Output", 
"Production", "Quantity", "Serial Output", "TOTAL SUM", "Unaccounted", 
"United Nations Data"), class = "factor"), Name = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 20L, 2L, 1L, 1L, 1L, 21L, 4L, 
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 
1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 19L, 1L, 1L, 1L, 1L), .Label = c("", 
"--------------------", " ", "Bahrain", "Bangladesh", "Barbados", 
"Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", 
"Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria", 
"Burkina Faso", "Chad", "Name", "The Bahamas"), class = "factor"), 
Type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 4L, 
2L, 1L, 1L, 1L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L), .Label = c("", "----", " ", "Code", "Type", 
"Unit"), class = "factor"), Symbol = structure(c(1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 20L, 22L, 2L, 1L, 1L, 1L, 4L, 5L, 
6L, 7L, 9L, 8L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 
19L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 21L, 1L, 1L, 1L, 
1L), .Label = c("", "------------", " ", "BAHM", "BAHR", 
"BANG", "BARB", "BELGM", "BELS", "BELZ", "BEN", "BHUT", "BOL", 
"BOSHER", "BOTS", "BRAZ", "BRUN", "BULG", "BURKF", "Country", 
"private", "Symbol"), class = "factor"), Code = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 19L, 2L, 1L, 1L, 1L, 12L, 
15L, 11L, 17L, 4L, 13L, 14L, 9L, 18L, 10L, 5L, 16L, 3L, 7L, 
8L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L), .Label = c("", "------------", "1504944270", "2287368539", 
"2388991307", "2453202442", "2561470743", "3205402223", "3221488867", 
"3230369605", "3247578406", "3712013344", "4307638090", "462793263", 
"4835205752", "4854959101", "5842098895", "5932776587", "Code"
), class = "factor"), Unit = structure(c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 17L, 16L, 2L, 1L, 1L, 1L, 7L, 9L, 10L, 14L, 
12L, 15L, 15L, 11L, 13L, 3L, 8L, 13L, 15L, 6L, 5L, 9L, 1L, 
1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", 
"-------------", "100", "1109", "27", "35", "40", "45", "58", 
"70", "74", "77", "79", "82", "95", "Output", "Per Unit"), class = "factor"), 
Total = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 25L, 
24L, 2L, 1L, 1L, 1L, 18L, 5L, 17L, 8L, 23L, 20L, 6L, 9L, 
7L, 11L, 12L, 13L, 19L, 15L, 14L, 10L, 3L, 16L, 1L, 1L, 1L, 
16L, 1L, 1L, 1L, 21L, 1L, 3L, 22L, 4L), .Label = c("", "---------------", 
"---------------            ----------------  ------  -----", 
"===============            ================  ======  =====", 
"126912", "147431", "170553", "175973", "203728", "230761", 
"293789", "304471", "376281", "386526", "399160", "4417002", 
"476025", "478030", "502999", "51012", "5610654", "56406056", 
"93351", "Output", "Total"), class = "factor"), Measurement = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 2L, 1L, 1L, 1L, 3L, 
9L, 3L, 4L, 10L, 9L, 6L, 4L, 5L, 10L, 7L, 9L, 4L, 8L, 10L, 
9L, 1L, 1L, 1L, 1L, 1L, 11L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L), .Label = c("", "--------", "20", "23", "24", "26", "27", 
"28", "29", "30", "420", "Measurement"), class = "factor"), 
Value = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 22L, 
23L, 2L, 1L, 1L, 1L, 5L, 19L, 11L, 8L, 3L, 18L, 13L, 6L, 
4L, 9L, 14L, 17L, 7L, 10L, 12L, 15L, 1L, 16L, 1L, 1L, 1L, 
16L, 1L, 1L, 1L, 20L, 1L, 1L, 21L, 1L), .Label = c("", "----------------", 
"15150240", "15891735", "16083459", "16959919", "20350968", 
"20909501", "21770264", "25121096", "27726279", "30024743", 
"34069742", "34841369", "38498281", "468004111", "49524999", 
"50512814", "50568702", "540650", "64506", "Country", "Value"
), class = "factor"), Percent = structure(c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 23L, 2L, 1L, 1L, 1L, 11L, 12L, 8L, 3L, 
17L, 16L, 5L, 10L, 20L, 9L, 6L, 7L, 4L, 15L, 14L, 22L, 1L, 
13L, 1L, 1L, 1L, 21L, 1L, 1L, 1L, 19L, 1L, 1L, 18L, 1L), .Label = c("", 
"------", "102", "104", "106", "112", "126", "129", "142", 
"15", "160", "177", "1775", "180", "191", "24", "25", "5640645", 
"650163", "87", "887.5", "95", "Production Percent"), class = "factor"), 
CO2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 15L, 14L, 
2L, 1L, 1L, 1L, 9L, 4L, 9L, 7L, 4L, 5L, 4L, 7L, 4L, 9L, 4L, 
11L, 4L, 12L, 10L, 4L, 1L, 6L, 1L, 1L, 1L, 8L, 1L, 1L, 1L, 
3L, 1L, 1L, 13L, 1L), .Label = c("", "-----", "?", "0", "0.2", 
"0.6", "1", "19.4", "2", "2.2", "4", "5", "564065", "CO2", 
"Cur."), class = "factor")), class = "data.frame", row.names = c(NA, 
-44L))

以下是我希望得到的结果:

structure(list(CountryID = c(10000L, 45000L, 5000L, 3000L, 15000L, 
2500L, 8300L, 8000L, 10000L, 3000L, 5000L, 200L, 10000L, 10000L, 
2000L, 7000L), Name = structure(c(16L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), .Label = c("Bahrain", 
"Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", 
"Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil", 
"Brunei", "Bulgaria", "Burkina Faso", "The Bahamas"), class = "factor"), 
Type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L), .Label = "Unit", class = "factor"), 
Symbol = structure(c(1L, 2L, 3L, 4L, 6L, 5L, 7L, 8L, 9L, 
10L, 11L, 12L, 13L, 14L, 15L, 16L), .Label = c("BAHM", "BAHR", 
"BANG", "BARB", "BELGM", "BELS", "BELZ", "BEN", "BHUT", "BOL", 
"BOSHER", "BOTS", "BRAZ", "BRUN", "BULG", "BURKF"), class = "factor"), 
Code = c(3712013344, 4835205752, 3247578406, 5842098895, 
2287368539, 4307638090, 462793263, 3221488867, 5932776587, 
3230369605, 2388991307, 4854959101, 1504944270, 2561470743, 
3205402223, 2453202442), Unit = c(40L, 58L, 70L, 82L, 77L, 
95L, 95L, 74L, 79L, 100L, 45L, 79L, 95L, 35L, 27L, 58L), 
Total = c(478030L, 126912L, 476025L, 175973L, 93351L, 51012L, 
147431L, 203728L, 170553L, 293789L, 304471L, 376281L, 502999L, 
399160L, 386526L, 230761L), Measurement = c(20L, 29L, 20L, 
23L, 30L, 29L, 26L, 23L, 24L, 30L, 27L, 29L, 23L, 28L, 30L, 
29L), Value = c(16083459L, 50568702L, 27726279L, 20909501L, 
15150240L, 50512814L, 34069742L, 16959919L, 15891735L, 21770264L, 
34841369L, 49524999L, 20350968L, 25121096L, 30024743L, 38498281L
), Percent = c(160L, 177L, 129L, 102L, 25L, 24L, 106L, 15L, 
87L, 142L, 112L, 126L, 104L, 191L, 180L, 95L), CO2 = c(2, 
0, 2, 1, 0, 0.2, 0, 1, 0, 2, 0, 4, 0, 5, 2.2, 0)), class = "data.frame", row.names = c(NA, 
-16L))

这能否通过 read.csv 的参数来实现?还是用其他方式清理数据底部会更容易?

三个想法:

  1. 使用 readLines(如 @user2554330 建议的那样),查找/删除特定行,对其进行过滤,然后使用 read.csv(text = ...) 解析该文本向量,这是三者中最少的。

  2. before[seq_len(min(head(which(!grepl("^[^- ]+$", before$Total)),1)-1L,nrow(before))),]; 当然,这有点复杂,但它可以满足您的需求(假设您已经用 skip= 过滤掉了前 14 行)。

  3. 使用外部脚本,例如在 pipe(...) 之类的调用中使用 sed -e '1,14d;/^[ -]+$/{g;q;}'

把文件读两遍。第一次使用 readLines("Example.csv"),逐行查找标记数据结尾的那一行,假设它位于第 n 行。然后在第二次读取时,使用:

# Second pass: read only the first n - 1 lines, i.e. everything above
# the end-of-data marker located at line n.
read.csv(
  "Example.csv",
  header = FALSE,
  col.names = c(
    "CountryID", "Name", "Type", "Symbol", "Code", "Unit",
    "Total", "Measurement", "Value", "Percent", "CO2"
  ),
  nrows = n - 1
)

(或者,如果您还要跳过某些行,nrows 可能需要取不同的值。)

最新更新