


# Generate a sample data file to be imported.
# NOTE(review): the original comment mentions two files, but only test1.txt is
# written in the visible code — a second cat() call may be missing.
# The newline escapes ("\n") are restored here: with a literal "n" the whole
# report ended up on one line. cat() joins its arguments with a single space,
# so the file contains seven lines, alternating text lines and " " lines.
cat("Quantify Compound Summary Report\n",
    "\nPrinted Mon March 28 14:54:39 2022\n",
    "\nCompound 1: One\n",
    "\nCompound 2: Two\n",
    file = "test1.txt")
## Step 1: ID list of data files
# list.files() interprets `pattern` as a regular expression. The dot is escaped
# and the match anchored to the end of the name so that only files whose
# extension is ".txt" are selected (".txt" alone also matched e.g. "mytxt.csv").
data.files <- list.files(path = ".",
                         pattern = "\\.txt$",
                         full.names = TRUE)
## Step 2: Read in the data files
# Each file becomes one list element; the first four lines of every file are
# skipped before the remaining lines are returned.
data.list.raw <- lapply(data.files, read_lines, skip = 4)
## Step 3: Identify the "compounds" in the data file output
# For every imported file, flag the lines that contain the text "Compound"
# (this can be applied to any Waters output).
Hdr.dat <- lapply(data.list.raw, grepl, pattern = "Compound")
# Label every line with the most recent header line seen above it: the running
# count of header flags is used as an index into the header lines themselves.
grp.dat <- Map(
  function(file_lines, is_header) {
    header_lines <- file_lines[is_header]
    header_lines[cumsum(is_header)]
  },
  data.list.raw,
  Hdr.dat
)
## Step 4: Unpack the tab delimited parts of the export file, then generate a list of dataframes within a list of imported files
# Read(x): parse a character vector of lines as a tab-delimited table.
#   x       character vector, one element per line of text
#   returns a data.frame; short rows are padded because fill = TRUE
# The separator is the tab character "\t" (the literal letter "t" split fields
# on every "t" in the text).
Read <- function(x) {
  read.table(text = x, sep = "\t", fill = TRUE, stringsAsFactors = FALSE)
}
# For each file, split its lines by compound label and parse every chunk with
# Read(), giving a list (per file) of data frames (per compound).
raw.dat <- Map(
  function(file_lines, line_groups) {
    chunks <- split(file_lines, line_groups)
    Map(Read, chunks)
  },
  data.list.raw,
  grp.dat
)
## Step 5: Curate the list of compounds - remove "Compound X: "
# The element names of each file's table list are the compound header lines.
# The "Compound <number>:" prefix is stripped with a pattern instead of a fixed
# character offset, so the result does not depend on how many digits the
# compound number has. Names that do not start with the prefix are left as-is.
cmpd.list <- lapply(raw.dat, function(x) {
  trimws(sub("^Compound\\s*\\d+:\\s*", "", names(x)))
})
## Step 6: Rename the headers for the dataframes, remove the blank rows and recentre
# NameCols(z): for every data frame in the named list `z`, use its second row
# as the column names and return the data frame without its first two rows.
#   z       named list of data frames
#   returns an unnamed list of data frames, in the order of names(z)
NameCols <- function(z) lapply(names(z), function(i) {
  x <- z[[i]]
  colnames(x) <- x[2, ]
  # NOTE(review): the closing lines of this function were missing from the
  # source. Dropping rows 1-2 (compound line and header line) is reconstructed
  # from the step description above — confirm against the original script.
  x[-(1:2), , drop = FALSE]
})
# Apply NameCols() to each file's tables and label the resulting data frames
# with that file's curated compound names.
data.list <- Map(
  function(file_tables, compound_labels) {
    cleaned <- NameCols(file_tables)
    setNames(cleaned, compound_labels)
  },
  raw.dat,
  cmpd.list
)
## Step 7: rbind the data based on the compound
# All compound names that occur in any imported file.
cmpd_names <- unique(unlist(lapply(data.list, names)))
# For each compound, pull that compound's data frame out of every file
# (map() with a name extracts the element of that name from each list entry).
result <- list()
for (n in cmpd_names) {
  result[[n]] <- map(data.list, n)
}
# Stack the per-file data frames of each compound into a single data frame,
# then drop the rows whose Name column is empty.
list.merged <- map(result, dplyr::bind_rows)
list.merged <- lapply(list.merged, function(x) x %>% filter(Name != ""))

这里的挑战在于脚本的时间效率(我可能要导入数百甚至数千个数据文件,每个文件有数百行数据,这可能需要相当长的时间),以及代码的"简洁性",这就是为什么我在这里包括 CCD_1 作为标签。我还希望这个方法具有高度的通用性,因为"化合物"可能会随时间变化。如果有人能想出一个干净高效的方法来做这一切,我将感激不尽。


## Build a long table with one row per tabular line of every input file,
## labelled with the compound section that line belongs to.
intermediate_result <-
  data.frame(file_name = c('test1.txt','test2.txt')) %>%
  rowwise() %>%
  ## read file content into a raw string:
  mutate(raw = read_file(file_name)) %>%
  ## separate raw file contents into rows
  ## using newline and carriage return as row delimiters:
  separate_rows(raw, sep = '[\n\r]') %>%
  ## provide a compound column for later grouping
  ## by extracting the 'Compound' string from column raw
  ## or setting the compound column to NA otherwise
  ## ('\\1' is the regex backreference to the captured group):
  mutate(compound = ifelse(grepl('^Compound', raw),
                           gsub('.*(Compound .*):.*', '\\1', raw),
                           NA_character_)) %>%
  ## remove rows with empty raw text:
  filter(raw != '') %>%
  ## filling missing compound values (NAs) with last non-NA compound string:
  fill(compound, .direction = 'down') %>%
  ## keep only rows with tab-separated raw string
  ## indicating tabular data
  filter(grepl('\t', raw)) %>%
  ## insert a column header 'Index' because
  ## original format has four data columns but only three header cols
  ## (the replacement keeps the tab between 'Index' and 'Name'):
  mutate(raw = gsub(' *\tName', 'Index\tName', raw))



## Parse the raw tab-separated lines of each compound into its own data frame.
intermediate_result %>%
  group_by(compound) %>%
  ## the nifty piece: you can store dataframes inside a dataframe:
  ## NOTE(review): the enclosing verb was missing from the source; summarise()
  ## (one list-column entry per compound) is reconstructed — confirm.
  summarise(tables = list(read.table(text = raw, header = TRUE, sep = '\t')))


intermediate_result %>%
split(f = as.factor(.$compound)) %>% 
lapply(function(x) x %>%
into = unlist(
str_split(x$raw[1], pattern = "t"))
