如何在 R 中的多个单词文件上运行 for 循环



我有44个文档文件。从每个文件中,我需要提取客户名称和金额。我可以使用 read_document 命令并使用 grep 提取金额和客户名称来为一个文件执行此操作。当我对 44 个文件执行此操作时,我收到错误。不知道我错在哪里:

ls()
rm(list = ls())
files <- list.files("~/experiment", ".doc")
files
length(files)
for (i in length(files)){
library(textreadr)
read_document(files[i])
}

这是我在一个文件上运行的完整代码:

file <- "~/customer_full_file.docx"
library(textreadr)
full_customer_file <- read_document(file, skip = 0, remove.empty = TRUE, trim = TRUE)
#checking file is read correctly
head(full_customer_file)
tail(full_customer_file)
# Extracting Name
full_customer_file <- full_customer_file[c(1,4)]
amount_extract <- grep("Amount", full_customer_file, value = T)
library(tm)
require(stringr)
amount_extract_2 <- lapply(amount_extract, stripWhitespace)
amount_extract_2 <- str_remove(marks_extract_2, "Amount")
name_extract <- grep("Customer Name and ID: ", full_customer_file, value = T)
name_extract
name_extract_2 <- lapply(name_extract, stripWhitespace)
name_extract_2 <- str_remove(name_extract_2, "Customer Name and ID: ")
name_extract_2 <- as.data.frame(name_extract_2)
names(name_extract_2)[1]  <- paste("customer_full_name")
amount_extract_2 <- as.data.frame(amount_extract_2)
names(amount_extract_2)[1] <- paste("amount")
amount_extract_2
customer_final_file <- cbind(name_extract_2, amount_extract_2)
write.table(customer_final_file, "~/customer_amount.csv", sep = ",", col.names = T, append = T)

这是我在 44 文件上运行的代码

ls()
rm(list = ls())
files <- list.files("~/experiment", ".doc")
files
length(files)
library(textreadr)
for (i in 1:length(files)){
read_document(files[i])
}

这是我收到的错误:

> library(textreadr)
> for (i in 1:length(files)){
+   read_document(files[i])
+   }
Warning messages:
1: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file
2: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file
3: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file
4: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file
5: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file

我可以给你我的代码,我用它来通过R中的sentimentr包分析不同的word文件。我想您可以使用与我相同的结构,只需更改 for in 函数即可循环每个 docx 的提取。

这是代码:

library(sentimentr)
folder_path <- "C:\Users\yourname\Documents\R\"
# Get a list of all the docx files in the folder
docx_files <- list.files(path = folder_path, pattern = "\.docx$", full.names = TRUE)
# Create an empty data frame to store the results
results <- data.frame(file = character(0), sentiment = numeric(0))
# Loop over the list of files
for (file in docx_files) {
# Read the docx file
sample_data <- read_docx(file)
# Extract the content and create a summary
content <- docx_summary(sample_data)
law <- content[sapply(strsplit(as.character(content$text),""),length)>5,]
# Calculate the sentiment of the summary (or in your case extraction)
sentiment <- sentiment_by(as.character(law$text))
# Add a row to the data frame with the results for this file
results <- rbind(results, data.frame(file = file, sentiment = sentiment$ave_sentiment))
}
# View the results data frame
View(results)

我希望这足够接近您的问题来解决它

最新更新