r语言 - 计算复杂文件夹结构中每个文件夹的文件数



我通过导入包含文件的文件夹结构创建了一个简单的data.tree

# Install pacman on demand. requireNamespace() only checks availability and
# does not attach the package, which is sufficient because pacman is used
# exclusively through the `pacman::` prefix below.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load_gh("trinker/pathr")
library(pathr)
library(data.tree)
# Import the folder (files included) as a data.tree structure.
folder_structure <- pathr::tree(
  path = "/Users/username/Downloads/top_level/",
  use.data.tree = TRUE,
  include.files = TRUE
)

现在,我想将对象folder_structure转换为一个data.frame,每个文件夹一行,并指定每个文件夹包含多少个文件。我怎样才能做到这一点?

例如,我有这个非常简单的文件夹结构:

top_level_folder
    sub_folder_1
        file1.txt
    sub_folder_2
        file2.txt

回答该问题将涉及创建如下所示的输出:

Folders             Files
top_level_folder    0
sub_folder_1        1
sub_folder_2        1

第一列可以通过调用 list.dirs("/Users/username/Downloads/top_level/") 简单地生成,但我不知道如何生成第二列。请注意,第二列是非递归的,这意味着子文件夹中的文件不被计算在内(即 top_level_folder 包含 0 个文件,即使 top_level_folder 的子文件夹共包含 2 个文件)。

如果你想看看你的解决方案是否可以扩展,下载Rails代码库:https://github.com/rails/rails/archive/master.zip 并在Rails更复杂的文件结构上尝试一下。

list.dirs() 会返回一个向量,其中包含从起始文件夹可以到达的每个子目录,因此可以直接用来生成数据框的第一列。非常方便。

# Collect the starting folder and every directory nested below it
dir <- "."
xs <- list.dirs(path = dir, recursive = TRUE)

list.files()可以告诉我们每个文件夹的内容,但它包括文件和文件夹。我们只想要文件。要获取文件计数,我们需要使用谓词过滤list.files()的输出。 file.info()可以告诉我们给定文件是否是目录,因此我们从中构建谓词。

# Predicate helpers for telling folders and files apart.
# is_dir() reads the `isdir` column reported by file.info() for each path.
is_dir <- function(x) {
  info <- file.info(x)
  info[["isdir"]]
}
# is_file() is the logical negation of is_dir().
is_file <- Negate(is_dir)

现在,我们解决如何在单个文件夹中获取文件数量。对布尔值求和将返回TRUE事例数。

# Number of plain files directly inside one folder.
# The listing is taken with full paths so is_file() can inspect each entry;
# summing the resulting logical vector counts the TRUE cases.
count_files_in_one_dir <- function(dir) {
  sum(is_file(list.files(dir, full.names = TRUE)))
}

为方便起见,我们包装了该函数以使其适用于许多文件夹。

# Apply count_files_in_one_dir() to each folder in `dir` and return the
# counts as an unnamed numeric vector, one element per folder.
count_files_in_dir <- function(dir) {
  counts <- vapply(
    X = dir,
    FUN = count_files_in_one_dir,
    FUN.VALUE = numeric(1),
    USE.NAMES = FALSE
  )
  counts
}

现在我们可以计算文件了。

# One row per directory together with the number of files directly inside it.
# tibble::tibble() is used because tibble::data_frame() is deprecated.
df <- tibble::tibble(
  dir = xs,
  nfiles = count_files_in_dir(xs)
)
df
#> # A tibble: 688 x 2
#>                                                  dir nfiles
#>                                                <chr>  <dbl>
#>  1                                                 .     11
#>  2                                         ./.github      3
#>  3                                     ./actioncable      7
#>  4                                 ./actioncable/app      0
#>  5                          ./actioncable/app/assets      0
#>  6              ./actioncable/app/assets/javascripts      1
#>  7 ./actioncable/app/assets/javascripts/action_cable      5
#>  8                                 ./actioncable/bin      1
#>  9                                 ./actioncable/lib      1
#> 10                    ./actioncable/lib/action_cable      8
#> # ... with 678 more rows

您可以使用dplyr链和 pathr 包中的 parse_path() 函数。tree函数基本上只是parse_path的包装器,因此更容易直接使用parse_path。例如像这样:

library(pathr)
library(dplyr)
# List every file below the starting folder, split each path into its
# components, keep the component at the chosen depth and tally how often
# each value occurs.
fls <- dir("C:/RBuildTools/3.3", recursive = TRUE, full.names = TRUE) %>%
  parse_path() %>%
  index(4) %>% # this is where you indicate the level or "depth"
               # of the folder of which want subfolder file counts
  data.frame(folders = .) %>%
  group_by(folders) %>%
  tally() %>%
  arrange(n)
# Files that sit directly in the starting folder show up as entries with
# n == 1. If you want to get rid of them, just add a
# filter(n > 1) at the end of the dplyr chain
# (note that this also drops folders that contain exactly one file).

对我来说,上面的代码产生以下结果:

> fls
# A tibble: 12 × 2
        folders     n
         <fctr> <int>
1       COPYING     1
2    README.txt     1
3    Rtools.txt     1
4  unins000.dat     1
5  unins000.exe     1
6   VERSION.txt     1
7           bin    56
8    mingw_libs   200
9      texinfo5   356
10    gcc-4.6.3  3787
11     mingw_32 13707
12     mingw_64 14619
# Build the example folder structure from the question.
# showWarnings = FALSE keeps the script re-runnable when the folders exist.
dir.create("top_level_folder", showWarnings = FALSE)
dir.create("top_level_folder/sub_folder_1", showWarnings = FALSE)
dir.create("top_level_folder/sub_folder_2", showWarnings = FALSE)
a <- "hello"
save(a, file = "top_level_folder/sub_folder_1/file1.txt")
save(a, file = "top_level_folder/sub_folder_2/file2.txt")

path <- "top_level_folder"
# Paths of every file below `path`, relative to `path`.
files <- list.files(path, recursive = TRUE)
# Folder that directly contains each file, as a path relative to `path`.
# Using the whole relative path (not just the last component) keeps nested
# folders and equally named folders in different branches apart.
# dirname() gives "." for files sitting directly in `path`, while
# list.dirs(full.names = FALSE) reports that root folder as "", so align them.
folders <- dirname(files)
folders[folders == "."] <- ""
all_folders <- data.frame(
  Folders = list.dirs(path, full.names = FALSE, recursive = TRUE),
  stringsAsFactors = FALSE
)
# Tabulate against the complete folder list so folders without files get a
# count of 0 and the rows stay in list.dirs() order.
file_counts <- table(factor(folders, levels = all_folders$Folders))
output <- data.frame(
  Folders = all_folders$Folders,
  Files = as.integer(file_counts),
  stringsAsFactors = FALSE
)
# Show the root folder under its own name instead of "".
all_folders$Folders[1] <- basename(path)
output$Folders[1] <- basename(path)
# Resulting `output`:
#            Folders Files
# 1 top_level_folder     0
# 2     sub_folder_1     1
# 3     sub_folder_2     1

您真正需要做的就是使用 list.dirs 列出目录(默认为 recursive = TRUE),并对其进行迭代,求出每个目录的 list.files 结果的长度(默认为 recursive = FALSE)。然后把结果整理成一个漂亮的数据框:

library(purrr)
# For every directory below the starting path, count the files it directly
# contains (non-recursive).
# list.files() always includes subdirectory names in non-recursive listings,
# so entries that are directories are excluded before counting.
# NOTE(review): sample output captured with the earlier length(list.files())
# version also counted subdirectories and therefore shows larger numbers.
files <- .libPaths()[1] %>%    # omit for current directory or supply alternate path
    list.dirs() %>% 
    map_df(~list(path = .x, 
                 files = sum(!dir.exists(list.files(.x, full.names = TRUE)))))
files
#> # A tibble: 4,457 x 2
#>                                                                           path files
#>                                                                          <chr> <int>
#>  1              /Library/Frameworks/R.framework/Versions/3.4/Resources/library   314
#>  2        /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind     9
#>  3   /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/help     5
#>  4   /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/html     2
#>  5   /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/Meta     6
#>  6      /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/R     3
#>  7      /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack    14
#>  8 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack/help     5
#>  9 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack/html     2
#> 10 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack/libs     2
#> # ... with 4,447 more rows

或者如果您愿意,也可以完全使用 base R 来完成:

# Base R only: one row per directory below the starting path, with the number
# of files directly inside it (subdirectories are not counted).
dirs <- list.dirs(.libPaths()[1])
files <- data.frame(
    path = dirs,
    # list.files() always includes subdirectory names in non-recursive
    # listings, so keep only the entries that are not directories.
    # NOTE(review): sample output captured with the earlier
    # length(list.files()) version shows larger numbers for that reason.
    files = vapply(dirs, function(path) {
        sum(!dir.exists(list.files(path, full.names = TRUE)))
    }, integer(1), USE.NAMES = FALSE),
    stringsAsFactors = FALSE
)
head(files)
#>                                                                        path files
#> 1            /Library/Frameworks/R.framework/Versions/3.4/Resources/library   314
#> 2      /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind     9
#> 3 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/help     5
#> 4 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/html     2
#> 5 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/Meta     6
#> 6    /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/R     3

这是一个非常紧凑的解决方案:

# Number of direct children of `node` whose `isLeaf` field is TRUE.
count_leaf_children <- function(node) {
  sum(Get(node$children, "isLeaf"))
}

# Print the tree with an extra `files` column computed per node by the helper
# above; isNotLeaf is passed as the filter so leaf nodes are not listed.
print(
  folder_structure,
  files = count_leaf_children,
  filterFun = isNotLeaf,
  pruneMethod = NULL
)

这会产生如下所示的内容:

                                                     levelName files
1   data.tree                                                     16
2    ¦--data                                                       2
3    ¦--data_gen                                                   2
4    ¦--.git                                                       8
5    ¦   ¦--hooks                                                  9
6    ¦   ¦--info                                                   1
7    ¦   ¦--logs                                                   1
8    ¦   ¦   °--refs                                               1
9    ¦   ¦       ¦--heads                                          4
10   ¦   ¦       ¦--remotes                                        0
11   ¦   ¦       ¦   °--origin                                     5
12   ¦   ¦--objects                                                0
13   ¦   ¦   ¦--01                                                 4
14   ¦   ¦   ¦--02                                                 5
...

但请注意,这也将空文件夹计为文件。

list.files 返回所有文件和目录的路径。R 中没有 is.file 函数,但有 dir.exists。由于我们知道所有路径都是实际存在的节点,因此那些不是目录的路径就可以算作文件。

# List every file and directory below `top_level` (paths relative to it) and
# flag which entries are files.
# The listing is taken by passing `top_level` to list.files() instead of
# calling setwd(), so the session's working directory is left untouched.
top_level <- "~/rails-master"
subitems <- data.frame(
  path = list.files(
    top_level,
    include.dirs = TRUE,
    recursive    = TRUE
  ),
  stringsAsFactors = FALSE
)
# The listed paths are relative to `top_level`, so re-anchor them before
# asking the file system whether each one is a directory.
subitems$is_file <- !dir.exists(file.path(top_level, subitems$path))

对于每一行,如果路径是目录,则它是它自己的目录路径。 如果路径是文件的路径,则其父级是目录路径。然后,只需通过目录路径计算is_file为真的频率即可。

# Directory each row is attributed to: a file is attributed to its parent
# directory, a directory is attributed to itself.
subitems$dir_path <- with(subitems, ifelse(is_file, dirname(path), path))
# Summing the is_file flags per directory path gives the number of files
# directly inside each directory.
file_counts <- with(subitems, tapply(is_file, dir_path, sum))
result <- data.frame(Folders = names(file_counts), Files = file_counts)

相关内容

  • 没有找到相关文章

最新更新