我通过导入包含文件的文件夹结构创建了一个简单的data.tree
。
# Install pacman on demand. It is only ever called through `pacman::`, so
# checking that its namespace is available is enough (no need to attach it).
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# pathr is loaded from its GitHub repository (trinker/pathr).
pacman::p_load_gh("trinker/pathr")
library(pathr)
library(data.tree)

# Import the folder hierarchy, files included, asking pathr for a data.tree
# representation. TRUE is spelled out because `T` can be reassigned.
folder_structure <- pathr::tree(
  path = "/Users/username/Downloads/top_level/",
  use.data.tree = TRUE,
  include.files = TRUE
)
现在,我想将对象folder_structure
转换为一个data.frame
,每个文件夹一行,并指定每个文件夹包含多少个文件。我怎样才能做到这一点?
例如,我有这个非常简单的文件夹结构:
top_level_folder
sub_folder_1
file1.txt
sub_folder_2
file2.txt
回答该问题将涉及创建如下所示的输出:
Folders Files
top_level_folder 0
sub_folder_1 1
sub_folder_2 1
第一列可以通过调用list.dirs("/Users/username/Downloads/top_level/")
简单地生成,但我不知道如何生成第二列。请注意,第二列是非递归的,这意味着子文件夹中的文件不被计算在内(即 top_level_folder
包含 0
个文件,即使 top_level_folder
的子文件夹包含 2 个文件)。
如果你想看看你的解决方案是否可以扩展,下载Rails代码库:https://github.com/rails/rails/archive/master.zip 并在Rails更复杂的文件结构上尝试一下。
list.dirs()
提供了可从起始文件夹访问的每个子目录的向量,以便处理数据帧的第一列。非常方便。
# Collect the starting folder together with every directory nested below it
dir <- "."
xs <- list.dirs(path = dir, full.names = TRUE, recursive = TRUE)
list.files()
可以告诉我们每个文件夹的内容,但它包括文件和文件夹。我们只想要文件。要获取文件计数,我们需要使用谓词过滤list.files()
的输出。 file.info()
可以告诉我们给定文件是否是目录,因此我们从中构建谓词。
# Predicates telling folders and files apart.
# is_dir() returns the `isdir` column reported by file.info() for each path;
# is_file() is its logical negation.
is_dir <- function(x) {
  info <- file.info(x)
  info[["isdir"]]
}
is_file <- function(...) !is_dir(...)
现在,我们来解决如何获取单个文件夹中的文件数量。对逻辑向量求和会返回其中 TRUE
的个数。
# Number of non-directory entries directly inside one folder (non-recursive)
count_files_in_one_dir <- function(dir) {
  # Full paths are needed so that is_file() can inspect each entry
  entries <- list.files(dir, full.names = TRUE)
  non_dir_flags <- is_file(entries)
  # Summing a logical vector counts its TRUE values
  sum(non_dir_flags)
}
为方便起见,我们包装了该函数以使其适用于许多文件夹。
# Apply count_files_in_one_dir() to each folder in `dir`, returning an
# unnamed numeric vector with one count per folder
count_files_in_dir <- function(dir) {
  counts <- vapply(
    X = dir,
    FUN = count_files_in_one_dir,
    FUN.VALUE = numeric(1),
    USE.NAMES = FALSE
  )
  counts
}
现在我们可以计算文件了。
# One row per directory in `xs`, paired with its non-recursive file count.
# tibble::tibble() is used because tibble::data_frame() is deprecated.
df <- tibble::tibble(
  dir = xs,
  nfiles = count_files_in_dir(xs)
)
df
#> # A tibble: 688 x 2
#> dir nfiles
#> <chr> <dbl>
#> 1 . 11
#> 2 ./.github 3
#> 3 ./actioncable 7
#> 4 ./actioncable/app 0
#> 5 ./actioncable/app/assets 0
#> 6 ./actioncable/app/assets/javascripts 1
#> 7 ./actioncable/app/assets/javascripts/action_cable 5
#> 8 ./actioncable/bin 1
#> 9 ./actioncable/lib 1
#> 10 ./actioncable/lib/action_cable 8
#> # ... with 678 more rows
您可以使用dplyr
链和 pathr
包中的 parse_path()
函数。tree
函数基本上只是parse_path
的包装器,因此更容易直接使用parse_path
。例如像这样:
library(pathr)
library(dplyr)
# List every file below the root, split each path into its components, keep
# the component at the chosen depth and count how often each value occurs.
fls <- dir("C:/RBuildTools/3.3", recursive = TRUE, full.names = TRUE) %>%
  parse_path() %>%
  # index() is where you indicate the level or "depth" of the folder
  # for which you want sub-folder file counts
  index(4) %>%
  data.frame(folders = .) %>%
  group_by(folders) %>%
  tally() %>%
  arrange(n)
# If you want to get rid of the files that sit directly in your starting
# folder (each of them shows up with a count of 1), just add
#   filter(n > 1)
# at the end of the dplyr chain. Note that this also drops any sub-folder
# that contains exactly one file.
对我来说,上面的代码产生以下结果:
> fls
# A tibble: 12 × 2
folders n
<fctr> <int>
1 COPYING 1
2 README.txt 1
3 Rtools.txt 1
4 unins000.dat 1
5 unins000.exe 1
6 VERSION.txt 1
7 bin 56
8 mingw_libs 200
9 texinfo5 356
10 gcc-4.6.3 3787
11 mingw_32 13707
12 mingw_64 14619
# Build a small example tree: one top-level folder holding two sub-folders
# with one file each.
dir.create("top_level_folder")
dir.create("top_level_folder/sub_folder_1")
dir.create("top_level_folder/sub_folder_2")
a <- "hello"
save(a, file = "top_level_folder/sub_folder_1/file1.txt")
save(a, file = "top_level_folder/sub_folder_2/file2.txt")

path <- "top_level_folder"

# Every file below `path`, as a path relative to `path`
files <- list.files(path, recursive = TRUE)

# Parent directory of each file, still relative to `path`. dirname() reports
# "." for files sitting directly in `path`, whereas
# list.dirs(full.names = FALSE) labels that folder "", so align the two.
folders <- dirname(files)
folders[folders == "."] <- ""

# All directories (relative paths; the first entry "" is `path` itself).
# Keying on the relative path rather than on the bare folder name keeps
# nested folders and same-named folders in different branches apart.
dir_paths <- list.dirs(path, full.names = FALSE, recursive = TRUE)
all_folders <- data.frame(Folders = dir_paths, stringsAsFactors = FALSE)
# Show the starting folder under its own name instead of ""
all_folders$Folders[dir_paths == ""] <- basename(path)

# tabulate() yields one count per directory, 0 for folders without files
output <- data.frame(
  Folders = all_folders$Folders,
  Files = tabulate(match(folders, dir_paths), nbins = length(dir_paths)),
  stringsAsFactors = FALSE
)
#            Folders Files
# 1 top_level_folder     0
# 2     sub_folder_1     1
# 3     sub_folder_2     1
您真正需要做的就是使用 list.dirs
列出目录(默认为 recursive = TRUE
),然后对其进行迭代,求出每个目录的 list.files
结果的长度(默认为 recursive = FALSE
)。整理成一个漂亮的数据框:
library(purrr)
# One row per directory below the library path, with the number of files
# that sit directly inside it.
files <- .libPaths()[1] %>%  # omit for current directory or supply alternate path
  list.dirs() %>%
  map_df(~list(
    path = .x,
    # A non-recursive list.files() also returns sub-directories, so count
    # only the entries that are not directories
    files = sum(!dir.exists(list.files(.x, full.names = TRUE)))
  ))
files
#> # A tibble: 4,457 x 2
#> path files
#> <chr> <int>
#> 1 /Library/Frameworks/R.framework/Versions/3.4/Resources/library 314
#> 2 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind 9
#> 3 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/help 5
#> 4 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/html 2
#> 5 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/Meta 6
#> 6 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/R 3
#> 7 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack 14
#> 8 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack/help 5
#> 9 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack/html 2
#> 10 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack/libs 2
#> # ... with 4,447 more rows
或者如果您愿意,也可以完全用 base R 来实现:
# Base R version: one row per directory below the library path, with the
# number of files that sit directly inside it.
lib_dirs <- list.dirs(.libPaths()[1])
files <- data.frame(
  path = lib_dirs,
  files = vapply(
    lib_dirs,
    # A non-recursive list.files() also returns sub-directories, so count
    # only the entries that are not directories
    function(path) sum(!dir.exists(list.files(path, full.names = TRUE))),
    integer(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)
head(files)
#> path files
#> 1 /Library/Frameworks/R.framework/Versions/3.4/Resources/library 314
#> 2 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind 9
#> 3 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/help 5
#> 4 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/html 2
#> 5 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/Meta 6
#> 6 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/R 3
这是一个非常紧凑的解决方案:
# Number of direct children of `node` that are leaves of the tree
count_leaf_children <- function(node) {
  sum(Get(node$children, "isLeaf"))
}

# Print the tree with an extra `files` column computed per node;
# isNotLeaf is supplied as the filter function for the printed nodes.
print(
  folder_structure,
  files = count_leaf_children,
  filterFun = isNotLeaf,
  pruneMethod = NULL
)
这会产生如下所示的内容:
levelName files
1 data.tree 16
2 ¦--data 2
3 ¦--data_gen 2
4 ¦--.git 8
5 ¦ ¦--hooks 9
6 ¦ ¦--info 1
7 ¦ ¦--logs 1
8 ¦ ¦ °--refs 1
9 ¦ ¦ ¦--heads 4
10 ¦ ¦ ¦--remotes 0
11 ¦ ¦ ¦ °--origin 5
12 ¦ ¦--objects 0
13 ¦ ¦ ¦--01 4
14 ¦ ¦ ¦--02 5
...
但请注意,这也将空文件夹计为文件。
list.files
返回所有文件和目录路径。R 中没有 is.file
函数,但有 dir.exists
。由于我们知道所有路径都是实际存在的节点,因此那些不是目录的路径就算作文件。
top_level <- "~/rails-master"

# Every file and directory below `top_level`, as paths relative to it.
# The root is passed explicitly instead of calling setwd(), so the session's
# working directory is left untouched.
subitems <- data.frame(
  path = list.files(
    top_level,
    include.dirs = TRUE,
    recursive = TRUE
  ),
  stringsAsFactors = FALSE
)
# The listed paths are relative, so re-anchor them at `top_level` before
# testing; anything that is not a directory is treated as a file.
subitems$is_file <- !dir.exists(file.path(top_level, subitems$path))
对于每一行,如果路径是目录,则它是它自己的目录路径。 如果路径是文件的路径,则其父级是目录路径。然后,只需通过目录路径计算is_file
为真的频率即可。
# A directory row is keyed by its own path; a file row is keyed by the
# directory that contains it.
subitems$dir_path <- with(
  subitems,
  ifelse(is_file, dirname(path), path)
)
# Summing the logical is_file flag within each key counts the files there
file_counts <- tapply(
  X = subitems$is_file,
  INDEX = subitems$dir_path,
  FUN = sum
)
result <- data.frame(
  Folders = names(file_counts),
  Files = file_counts
)