我有这个巨大的循环,我确信它可以被优化,但它就是这样工作的。问题是,在45,000行数据集上计算大约需要8天。所以我尝试使用foreach。一开始我很高兴,因为当我运行它时,所有的内核都被使用了,然而,没有一个计算的变量或循环内使用的迭代器被实际计算(我看不到迭代器"row", prov_list(在第四行),…(或环境中的任何其他对象)。
我试图包含。export = "row",但它没有任何区别。有趣的是,循环实际上需要时间来计算,并且在此过程中CPU的使用率会上升到100%。
如果有人能给点提示,我会很感激的。foreach(row = 1:50, .packages = c("dplyr", "hablar", "foreach")) %dopar% { # 1:nrow(data_QI)
# Prepare data to be used in the loop
obs <- data_QI[row,]
prov_list <- colnames(GTC_matrix[-1])
if (!is.na(obs$EXPIRATIONYEAR)) {
tm_years <- seq(obs$APPLICATIONYEAR, obs$EXPIRATIONYEAR)
# articles_provinces file only contains info until 1920, so we remove years higher than 1920
# only for searching in this file purposes (it is not changed in original data)
if (tm_years[length(tm_years)] >= 1920) {
tm_years <- tm_years[!(tm_years > 1920)]
}
} else {
obs$EXPIRATIONYEAR <- 1920
tm_years <- seq(obs$APPLICATIONYEAR, obs$EXPIRATIONYEAR)
}
tm_ap_appprov <- obs$APPPROV
tm_ap_appcoun <- obs$APPCOUN
# Extract all provinces except that of the observation at hand
tm_ap_appprov_not <- prov_list[-match(tm_ap_appprov, prov_list)]
tm_industries <- select(obs, starts_with("INDUSTRY_")) %>%
select_if(function(col) all(col > 0)) %>%
names()
# Initialize objects used in the loop
numart_ynp <- data.frame() # numerator of the numerator
numart_yp <- data.frame() # denominator of the numerator
numart_yn <- data.frame() # numerator of the denominator
numart_y <- data.frame() # denominator of the denominator
numart_ync <- data.frame() # numerator of the numerator
numart_yc <- data.frame() # denominator of the numerator
numart_yn_2 <- data.frame() # numerator of the denominator
numart_y_2 <- data.frame() # denominator of the denominator
numart_ynp_exp <- list() # numerator of the numerator
numart_yp_exp <- list() # denominator of the numerator
numart_yn_exp <- list() # numerator of the denominator
numart_y_exp <- list() # denominator of the denominator
RTMGS_noprov <- list()
RTMGD_noprov <- list()
data_QI$RTMGS_EXP <- 0
data_QI$RTMGD_EXP <- 0
# Get into the loop for indices calculation
if (data_QI$APPCOUN[row] == "SPAIN") {
print(paste(row, "- entra en == SPAIN"))
GTC_vector <- select(GTC_matrix, tm_ap_appprov)
for (y in tm_years) {
for (n in tm_industries) {
# This is the numerator of the numerator
numart_ynp <- bind_rows(numart_ynp, (filter(articles_provinces, Year == y, SECTOR == n) %>%
select(tm_ap_appprov)))
# This is the numerator of the denominator in RTMGS
numart_yn <- bind_rows(numart_yn, (filter(articles_provinces, Year == y, SECTOR == n)))
}
}
for (y in tm_years) { # File only contains info until 1920
# This is the denominator of the numerator in RTMGS
numart_yp <- bind_rows(numart_yp, (filter(articles_provinces, Year == y) %>%
select(tm_ap_appprov)))
# This is the denominator of the denominator in RTMGS
numart_y <- bind_rows(numart_y, (filter(articles_provinces, Year == y)))
# This loop serves also to calculate NUM_NUM and NUM_DEN of RTMGS_EXP and RTMGD_EXP
for (p in tm_ap_appprov_not) {
numart_ynp_exp <- append(numart_ynp_exp, (filter(articles_provinces, Year == y, SECTOR == n) %>%
select(p)))
numart_yp_exp <- append(numart_yp_exp, (filter(articles_provinces, Year == y) %>%
select(p)))
}
}
# El NUM_DEN y DEN_DEN de RTMGS_EXP y RTMGD_EXT solo hay que hacerlo una vez, no por cada p, porque en realidad va a salir la misma lista para cada p. Hay que sacarlos del loop.
numart_yn_exp <- append(numart_yn_exp, (filter(articles_provinces, Year == y, SECTOR == n) %>%
select(-Year, -SECTOR, -tm_ap_appprov)))
numart_y_exp <- append(numart_y_exp, (filter(articles_provinces, Year == y) %>%
select(-Year, -SECTOR, -tm_ap_appprov)))
# Final calculation of RTMGS and RTMGD and RTMGS_EXP and RTMGD_EXP
# RTMGS and RTMGD
data_QI[row,"RTMGS_NUM_NUM"] <- mean(unlist(numart_ynp))
data_QI[row,"RTMGS_DEN_NUM"] <- mean(unlist(numart_yn[3:ncol(numart_yn)]))
data_QI[row,"RTMGS_NUM_DEN"] <- mean(unlist(numart_yp))
data_QI[row,"RTMGS_DEN_DEN"] <- mean(unlist(numart_y[3:ncol(numart_y)]))
data_QI <- data_QI %>%
mutate(RTMGS = (RTMGS_NUM_NUM/RTMGS_DEN_NUM)/(RTMGS_NUM_DEN/RTMGS_DEN_DEN),
RTMGD = (1/(ncol(articles_provinces)-2))*((RTMGS-min(RTMGS, na.rm = T))/(max(RTMGS, na.rm = T)-min(RTMGS, na.rm = T)))
)
# RTMGS_EXP and RTMGD_EXP
# The RTMGS and RTMGD for each region are needed first. Note that the min/max fun is actually min_/max_ from hablar package
for (i in (1:54)) {
RTMGS_noprov[[i]] <- (numart_ynp_exp[[i]]/mean(unlist(numart_yp_exp[[i]])))/(numart_yn_exp[[i]]/mean(unlist(numart_y_exp[[i]]))) # Hay muchos NaN, porque en muchas de las otras provincias que no son la de la marca en cuestion, el numero de articulos es cero. Y cero/cero = NaN
# RTMGD_noprov must include max and min for ALL provinces, not n-1
RTMGS_allprov <- append(RTMGS_noprov, data_QI$RTMGS[row])
RTMGD_noprov[[i]] <- (1/(ncol(articles_provinces)-2))*((RTMGS_allprov[[i]]-min_(unlist(RTMGS_allprov)))/(max_(unlist(RTMGS_allprov))-min_(unlist(RTMGS_allprov))))
}
# Now the Exposure. Note that the max fun is actually max_ from hablar package
data_QI[row, "RTMGS_EXP"] <- max_(unlist(RTMGS_noprov) / GTC_vector)
data_QI[row, "RTMGD_EXP"] <- max_(unlist(RTMGD_noprov) / GTC_vector)
print(paste(row, "ha terminado == SPAIN"))
} else {
# Observations that are not SPAIN (this is the else clause of: (data_QI$APPCOUN[row] == "SPAIN"))
print(paste(row, "- entra en != SPAIN"))
for (y in tm_years) { # File only contains info until 1920
for (n in tm_industries) {
numart_ync <- bind_rows(numart_ync, (filter(articles_countries, Year == y, SECTOR == n) %>%
select(tm_ap_appcoun)))
numart_yn_2 <- bind_rows(numart_yn_2, (filter(articles_countries, Year == y, SECTOR == n) %>%
mutate(FOREINSP = (rowSums(across(where(is.numeric))) - Year) - SPAIN))) # not very elegant subtracting YEAR but it damn works
}
}
for (y in tm_years) { # File only contains info until 1920
numart_yc <- bind_rows(numart_yc, (filter(articles_countries, Year == y) %>%
select(tm_ap_appcoun)))
numart_y_2 <- bind_rows(numart_y_2, (filter(articles_countries, Year == y) %>%
mutate(FOREINSP = (rowSums(across(where(is.numeric))) - Year) - SPAIN))) # not very elegant subtracting YEAR but it damn works
}
# Final calculation of QIX_3 (RTMICS) and QIX_4 (RTMICD)
data_QI[row,"RTMICS_NUM_NUM"] <- mean(unlist(numart_ync), na.rm = T)
data_QI[row,"RTMICS_DEN_NUM"] <- mean(unlist(numart_yc))
data_QI[row,"RTMICS_NUM_DEN"] <- mean(numart_yn_2$FOREINSP)
data_QI[row,"RTMICS_DEN_DEN"] <- mean(numart_y_2$FOREINSP)
data_QI <- data_QI %>% mutate(RTMICS = (RTMICS_NUM_NUM/RTMICS_DEN_NUM)/(RTMICS_NUM_DEN/RTMICS_DEN_DEN),
RTMICD = (1/(ncol(articles_countries)-2))*((RTMICS-min(RTMICS, na.rm = T))/ (max(RTMICS, na.rm = T)-min(RTMICS, na.rm = T))))
print(paste(row,"ha terminado != SPAIN"))
}
}
foreach
循环的每次迭代应该返回一个对象(例如数据帧或矩阵)。然后按照foreach
函数的.combine
参数的规定将它们组合起来。然后,foreach
函数返回组合后的对象。所以你需要这样写:
combined_results <- foreach(i = 1:50, .combine=rbind) %dopar% {
# code that produces some_result
return(some_result)
}