我正试图在Linux集群上用R运行以下代码。我想要以使用全部处理能力(指定核心/节点/内存)。代码下面基本上运行基于GAM回归的预测,并保存结果作为多组数据的CSV文件(这里我只显示两个)。
我可以通过Rmpi/snow/doParallel这样的并行包传递这个代码吗?非常感谢。
数据
dput(head(cont))
structure(list(citycode = c(101L, 101L, 101L, 101L, 101L, 101L
), year = c(1970L, 1970L, 1970L, 1970L, 1970L, 1970L), week = 1:6,
wk = 1:6, month = c(1L, 1L, 1L, 1L, 2L, 2L), day = c(10L,
17L, 24L, 31L, 7L, 14L), countall = c(17L, 25L, 32L, 25L,
23L, 25L), city = structure(c(10L, 10L, 10L, 10L, 10L, 10L
), .Label = c("AKRON", "ALBANY", "ALBUQUERQUE", "ALLENTOWN",
"ATLANTA", "AUSTIN", "BALTIMORE", "BERKELEY", "BIRMINGHAM",
"BOSTON", "BRIDGEPORT", "BUFFALO", "CAMBRIDGE", "CAMDEN",
"CANTON", "CHARLOTTE", "CHATTANOOGA", "CHICAGO", "CINCINNATI",
"CLEVELAND", "COLUMBUS", "DALLAS", "DAYTON", "DENVER", "DETROIT",
"DULUTH", "ELIZABETH", "ERIE", "EVANSVILLE", "FLINT", "FRESNO",
"GARY", "GLENDALE", "HARTFORD", "HONOLULU", "HOUSTON", "INDIANAPOLIS",
"JACKSONVILLE", "KNOXVILLE", "LINCOLN", "LOUISVILLE", "LOWELL",
"LYNN", "MADISON", "MEMPHIS", "MIAMI", "MILWAUKEE", "MINNEAPOLIS",
"MOBILE", "MONTGOMERY", "NASHVILLE", "NEWARK", "NORFOLK",
"OAKLAND", "OGDEN", "OMAHA", "PASADENA", "PATERSON", "PEORIA",
"PHILADELPHIA", "PHOENIX", "PITTSBURG", "PORTLAND", "PROVIDENCE",
"PUEBLO", "READING", "RICHMOND", "ROCHESTER", "ROCKFORD",
"SACRAMENTO", "SAVANNAH", "SCHENECTADY", "SCRANTON", "SEATTLE",
"SHREVEPORT", "SOMERVILLE", "SPOKANE", "SPRINGFIELD", "SYRACUSE",
"TACOMA", "TAMPA", "TOLEDO", "TRENTON", "TUCSON", "TULSA",
"UTICA", "WASHINGTON", "WATERBURY", "WICHITA", "WILMINGTON",
"WORCESTER", "YONKERS", "YOUNGSTOWN"), class = "factor"),
tmax = c(-4.92166666666663, -5.63428571428569, -2.52714285714284,
-4.97571428571427, 0.498571428571446, 1.29857142857147),
tmin = c(-13.8816666666667, -13.8142857142857, -12.9971428571429,
-14.36, -9.03142857142855, -9.3342857142857), tmean = c(-9.57770833333332,
-9.87196428571428, -8.20696428571427, -8.79892857142856,
-3.58249999999998, -4.0480357142857), hmax = c(0.0021666666666667,
0.0021428571428571, 0.0028571428571429, 0.0025714285714286,
0.0044285714285714, 0.0047142857142857), hmin = c(0.0011666666666667,
0.001, 0.001, 0.0007142857142857, 0.0012857142857143, 0.0017142857142857
), hmean = c(0.001625, 0.0014464285714286, 0.0017678571428571,
0.0018214285714286, 0.0028928571428571, 0.0029107142857143
), population = c(3900000L, 3900000L, 3900000L, 3900000L,
3900000L, 3900000L), lncount = c(2.83321334405622, 3.2188758248682,
3.46573590279973, 3.2188758248682, 3.13549421592915, 3.2188758248682
), lnpop = c(15.176487, 15.176487, 15.176487, 15.176487,
15.176487, 15.176487), count_pop = c(4.35897435897e-06, 6.41025641026e-06,
8.20512820513e-06, 6.41025641026e-06, 5.89743589744e-06,
6.41025641026e-06), lncount_pop = c(-12.3432737670437, -11.9576112862317,
-11.7107512083001, -11.9576112862317, -12.0409928951707,
-11.9576112862317), uniqueid = c(43L, 43L, 43L, 43L, 43L,
43L), msax = structure(c(9L, 9L, 9L, 9L, 9L, 9L), .Label = c("Akron, OH",
"Albany-Schenectady-Troy, NY", "Albuquerque, NM", "Allentown-Bethlehem-Easton, PA-NJ",
"Atlanta-Sandy Springs-Roswell, GA", "Austin-Round Rock, TX",
"Baltimore-Columbia-Towson, MD", "Birmingham-Hoover, AL",
"Boston-Cambridge-Newton, MA-NH", "Bridgeport-Stamford-Norwalk, CT",
"Buffalo-Cheektowaga-Niagara Falls, NY", "Canton-Massillon, OH",
"Charlotte-Concord-Gastonia, NC-SC", "Chattanooga, TN-GA",
"Chicago-Naperville-Elgin, IL-IN-WI", "Cincinnati, OH-KY-IN",
"Cleveland-Elyria, OH", "Columbus, OH", "Dallas-Fort Worth-Arlington, TX",
"Dayton, OH", "Denver-Aurora-Lakewood, CO", "Detroit-Warren-Dearborn, MI",
"Duluth, MN-WI", "Erie, PA", "Evansville, IN-KY", "Flint, MI",
"Fresno, CA", "Hartford-West Hartford-East Hartford, CT",
"Houston-The Woodlands-Sugar Land, TX", "Indianapolis-Carmel-Anderson, IN",
"Jacksonville, FL", "Knoxville, TN", "Lincoln, NE", "Los Angeles-Long Beach-Anaheim, CA",
"Louisville/Jefferson County, KY-IN", "Madison, WI", "Memphis, TN-MS-AR",
"Miami-Fort Lauderdale-West Palm Beach, FL", "Milwaukee-Waukesha-West Allis, WI",
"Minneapolis-St. Paul-Bloomington, MN-WI", "Mobile, AL",
"Montgomery, AL", "Nashville-Davidson--Murfreesboro--Franklin, TN",
"New Haven-Milford, CT", "New York-Newark-Jersey City, NY-NJ-PA",
"Ogden-Clearfield, UT", "Omaha-Council Bluffs, NE-IA", "Peoria, IL",
"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD", "Phoenix-Mesa-Scottsdale, AZ",
"Pittsburgh, PA", "Portland-Vancouver-Hillsboro, OR-WA",
"Providence-Warwick, RI-MA", "Pueblo, CO", "Reading, PA",
"Richmond, VA", "Rochester, NY", "Rockford, IL", "Sacramento--Roseville--Arden-Arcade, CA",
"San Francisco-Oakland-Hayward, CA", "Savannah, GA", "Scranton--Wilkes-Barre--Hazleton, PA",
"Seattle-Tacoma-Bellevue, WA", "Shreveport-Bossier City, LA",
"Spokane-Spokane Valley, WA", "Springfield, MA", "Syracuse, NY",
"Tampa-St. Petersburg-Clearwater, FL", "Toledo, OH", "Trenton, NJ",
"Tucson, AZ", "Tulsa, OK", "Urban Honolulu, HI", "Utica-Rome, NY",
"Virginia Beach-Norfolk-Newport News, VA-NC", "Washington-Arlington-Alexandria, DC-VA-MD-WV",
"Wichita, KS", "Worcester, MA-CT", "Youngstown-Warren-Boardman, OH-PA"
), class = "factor"), income = c(4686L, 4686L, 4686L, 4686L,
4686L, 4686L), deflator = c(22.805, 22.805, 22.805, 22.805,
22.805, 22.805), real_inc = c(20548.131, 20548.131, 20548.131,
20548.131, 20548.131, 20548.131), lnincome = c(9.9305248,
9.9305248, 9.9305248, 9.9305248, 9.9305248, 9.9305248), latitude = c(42.358433,
42.358433, 42.358433, 42.358433, 42.358433, 42.358433), longitude = c(-71.059776,
-71.059776, -71.059776, -71.059776, -71.059776, -71.059776
), tmax_tmean = c(0.51386683488134, 0.570736030967926, 0.307926630257402,
0.565490928278604, -0.139168577410035, -0.320790506859599
), tmin_tmean = c(1.44937245774694, 1.39934518748982, 1.58367240366414,
1.63201688517271, 2.52098494666535, 2.30588027703031), hmax_hmean = c(1.33333333333334,
1.48148148148148, 1.61616161616162, 1.41176470588235, 1.53086419753087,
1.61963190184049), hmin_hmean = c(0.71794871794872, 0.691358024691359,
0.565656565656567, 0.392156862745098, 0.444444444444446,
0.588957055214722), yearwk = 197001:197006, l1tmax = c(NA,
-4.9216666, -5.6342859, -2.5271428, -4.9757142, 0.49857143
), l2tmax = c(NA, NA, -4.9216666, -5.6342859, -2.5271428,
-4.9757142), l3tmax = c(NA, NA, NA, -4.9216666, -5.6342859,
-2.5271428), l1tmin = c(NA, -13.881667, -13.814285, -12.997143,
-14.36, -9.0314283), l2tmin = c(NA, NA, -13.881667, -13.814285,
-12.997143, -14.36), l3tmin = c(NA, NA, NA, -13.881667, -13.814285,
-12.997143), l1tmean = c(NA, -9.5777082, -9.8719645, -8.2069645,
-8.7989283, -3.5825), l2tmean = c(NA, NA, -9.5777082, -9.8719645,
-8.2069645, -8.7989283), l3tmean = c(NA, NA, NA, -9.5777082,
-9.8719645, -8.2069645), l1hmax = c(NA, 0.0021666666, 0.0021428571,
0.0028571428, 0.0025714287, 0.0044285716), l2hmax = c(NA,
NA, 0.0021666666, 0.0021428571, 0.0028571428, 0.0025714287
), l3hmax = c(NA, NA, NA, 0.0021666666, 0.0021428571, 0.0028571428
), l1hmin = c(NA, 0.0011666666, 0.001, 0.001, 0.00071428571,
0.0012857143), l2hmin = c(NA, NA, 0.0011666666, 0.001, 0.001,
0.00071428571), l3hmin = c(NA, NA, NA, 0.0011666666, 0.001,
0.001), l1hmean = c(NA, 0.001625, 0.0014464286, 0.0017678571,
0.0018214285, 0.0028928572), l2hmean = c(NA, NA, 0.001625,
0.0014464286, 0.0017678571, 0.0018214285), l3hmean = c(NA,
NA, NA, 0.001625, 0.0014464286, 0.0017678571), l1count_pop = c(NA,
4.3589744e-06, 6.4102564e-06, 8.2051283e-06, 6.4102564e-06,
5.897436e-06), l2count_pop = c(NA, NA, 4.3589744e-06, 6.4102564e-06,
8.2051283e-06, 6.4102564e-06), l3count_pop = c(NA, NA, NA,
4.3589744e-06, 6.4102564e-06, 8.2051283e-06), l4tmax = c(NA,
NA, NA, NA, -4.9216666, -5.6342859), l4tmin = c(NA, NA, NA,
NA, -13.881667, -13.814285), l4tmean = c(NA, NA, NA, NA,
-9.5777082, -9.8719645), l4hmax = c(NA, NA, NA, NA, 0.0021666666,
0.0021428571), l4hmean = c(NA, NA, NA, NA, 0.001625, 0.0014464286
), l4hmin = c(NA, NA, NA, NA, 0.0011666666, 0.001)), .Names = c("citycode",
"year", "week", "wk", "month", "day", "countall", "city", "tmax",
"tmin", "tmean", "hmax", "hmin", "hmean", "population", "lncount",
"lnpop", "count_pop", "lncount_pop", "uniqueid", "msax", "income",
"deflator", "real_inc", "lnincome", "latitude", "longitude",
"tmax_tmean", "tmin_tmean", "hmax_hmean", "hmin_hmean", "yearwk",
"l1tmax", "l2tmax", "l3tmax", "l1tmin", "l2tmin", "l3tmin", "l1tmean",
"l2tmean", "l3tmean", "l1hmax", "l2hmax", "l3hmax", "l1hmin",
"l2hmin", "l3hmin", "l1hmean", "l2hmean", "l3hmean", "l1count_pop",
"l2count_pop", "l3count_pop", "l4tmax", "l4tmin", "l4tmean",
"l4hmax", "l4hmean", "l4hmin"), .internal.selfref = <pointer: (nil)>, row.names = c(NA,
6L), class = c("data.table", "data.frame"))
#####################
library(data.table)
library(mgcv)
library(reshape2)
library(dplyr)
library(tidyr)
library(lubridate)
library(DataCombine)
#
gam_max_count_wk <- gam(count_pop ~ factor(citycode) + factor(year) +
factor(week) + s(lnincome) + s(tmax) +
s(hmax),data=cont,na.action="na.omit", method="ML")
#
# Historic
temp_hist <- read.csv("/work/sd00815/giss_historic/giss_temp_hist.csv")
humid_hist <- read.csv("/work/sd00815/giss_historic/giss_hum_hist.csv")
#
temp_hist <- as.data.table(temp_hist)
humid_hist <- as.data.table(humid_hist)
#
# Merge
mykey<- c("FIPS", "year","month", "week")
setkeyv(temp_hist, mykey)
setkeyv(humid_hist, mykey)
#
hist<- merge(temp_hist, humid_hist, by=mykey)
#
hist$X.x <- NULL
hist$X.y <- NULL
#
# Max
hist_max <- hist
hist_max$FIPS <- hist_max$year <- hist_max$month <- hist_max$tmin <-
hist_max$tmean <- hist_max$hmin <- hist_max$hmean <- NULL
#
# Adding Factors
hist_max$citycode <- rep(101,nrow(hist_max))
hist_max$year <- rep(2010,nrow(hist_max))
hist_max$lnincome <- rep(10.262,nrow(hist_max))
#
# Predictions
pred_hist_max <- predict.gam(gam_max_count_wk,hist_max)
#
pred_hist_max <- as.data.table(pred_hist_max)
pred_hist_max <- cbind(hist, pred_hist_max)
pred_hist_max$tmax <- pred_hist_max$tmean <- pred_hist_max$tmin <-
pred_hist_max$hmean <- pred_hist_max$hmax <- pred_hist_max$hmin <- NULL
#
# Aggregate by FIPS
max_hist <- pred_hist_max %>%
group_by(FIPS) %>%
summarise(pred_hist = mean(pred_hist_max))
#
### Future
## 4.5
# 4.5_2021_2050
temp_sim <-
read.csv("/work/sd00815/giss_future/giss_4.5_2021_2050_temp.csv")
humid_sim <-
read.csv("/work/sd00815/giss_future/giss_4.5_2021_2050_temp.csv")
#
# Max
temp_sim <- as.data.table(temp_sim)
setnames(temp_sim, "max", "tmax")
setnames(temp_sim, "min", "tmin")
setnames(temp_sim, "avg", "tmean")
#
humid_sim <- as.data.table(humid_sim)
setnames(humid_sim, "max", "hmax")
setnames(humid_sim, "min", "hmin")
setnames(humid_sim, "avg", "hmean")
#
temp_sim$X <- NULL
humid_sim$X <- NULL
#
# Merge
mykey<- c("FIPS", "year","month", "week")
setkeyv(temp_sim, mykey)
setkeyv(humid_sim, mykey)
#
sim <- merge(temp_sim, humid_sim, by=mykey)
#
sim_max <- sim
#
sim_max$FIPS <- sim_max$year <- sim_max$month <- sim_max$tmin <-
sim_max$tmean <- sim_max$hmin <- sim_max$hmean <- NULL
#
# Adding Factors
sim_max$citycode <- rep(101,nrow(sim_max))
sim_max$year <- rep(2010,nrow(sim_max))
sim_max$week <- rep(1,nrow(sim_max))
sim_max$lnincome <- rep(10.262,nrow(sim_max))
#
# Predictions
pred_sim_max <- predict.gam(gam_max_count_wk,sim_max)
#
pred_sim_max <- as.data.table(pred_sim_max)
pred_sim_max <- cbind(sim, pred_sim_max)
pred_sim_max$tmax <- pred_sim_max$tmean <- pred_sim_max$tmin <-
pred_sim_max$hmean <- pred_sim_max$hmax <- pred_sim_max$hmin <- NULL
#
# Aggregate by FIPS
max_sim <- pred_sim_max %>%
group_by(FIPS) %>%
summarise(pred_sim = mean(pred_sim_max))
#
# Merge with Historical Data
max_hist$FIPS <- as.factor(max_hist$FIPS)
max_sim$FIPS <- as.factor(max_sim$FIPS)
#
mykey1<- c("FIPS")
setkeyv(max_hist, mykey1)
setkeyv(max_sim, mykey1)
max_change <- merge(max_hist, max_sim, by=mykey1)
max_change$change <-
((max_change$pred_sim-max_change$pred_hist)/max_change$pred_hist)*100
#
write.csv(max_change, file =
"/work/sd00815/projections_data/year_wk_fe/giss/max/giss_4.5_2021_2050.csv")
# 4.5_2081_2100
temp_sim <-
read.csv("/work/sd00815/giss_future/giss_4.5_2081_2100_temp.csv")
humid_sim <-
read.csv("/work/sd00815/giss_future/giss_4.5_2081_2100_temp.csv")
#
# Max
temp_sim <- as.data.table(temp_sim)
setnames(temp_sim, "max", "tmax")
setnames(temp_sim, "min", "tmin")
setnames(temp_sim, "avg", "tmean")
#
humid_sim <- as.data.table(humid_sim)
setnames(humid_sim, "max", "hmax")
setnames(humid_sim, "min", "hmin")
setnames(humid_sim, "avg", "hmean")
#
temp_sim$X <- NULL
humid_sim$X <- NULL
#
# Merge
mykey<- c("FIPS", "year","month", "week")
setkeyv(temp_sim, mykey)
setkeyv(humid_sim, mykey)
#
sim <- merge(temp_sim, humid_sim, by=mykey)
#
sim_max <- sim
#
sim_max$FIPS <- sim_max$year <- sim_max$month <- sim_max$tmin <-
sim_max$tmean <- sim_max$hmin <- sim_max$hmean <- NULL
#
# Adding Factors
sim_max$citycode <- rep(101,nrow(sim_max))
sim_max$year <- rep(2010,nrow(sim_max))
sim_max$week <- rep(1,nrow(sim_max))
sim_max$lnincome <- rep(10.262,nrow(sim_max))
#
# Predictions
pred_sim_max <- predict.gam(gam_max_count_wk,sim_max)
#
pred_sim_max <- as.data.table(pred_sim_max)
pred_sim_max <- cbind(sim, pred_sim_max)
pred_sim_max$tmax <- pred_sim_max$tmean <- pred_sim_max$tmin <-
pred_sim_max$hmean <- pred_sim_max$hmax <- pred_sim_max$hmin <- NULL
#
# Aggregate by FIPS
max_sim <- pred_sim_max %>%
group_by(FIPS) %>%
summarise(pred_sim = mean(pred_sim_max))
#
# Merge with Historical Data
max_hist$FIPS <- as.factor(max_hist$FIPS)
max_sim$FIPS <- as.factor(max_sim$FIPS)
#
mykey1<- c("FIPS")
setkeyv(max_hist, mykey1)
setkeyv(max_sim, mykey1)
max_change <- merge(max_hist, max_sim, by=mykey1)
max_change$change <-
((max_change$pred_sim-max_change$pred_hist)/max_change$pred_hist)*100
#
write.csv(max_change, file =
"/work/sd00815/projections_data/year_wk_fe/giss/max/giss_4.5_2081_2100.csv")
####################
您知道您的示例在gam
调用后立即在read.csv
上死亡吗?
无论如何,我很快修改并测量了我自己的GAM代码,并可以告诉你使用与单线程标准R相比,带有MKL库的MicrosoftR在GAM调用中为我提供了2到3倍的加速。
从这里安装它,与MKL一起,并在代码集的顶部安装实际内核数(而不是HT内核数)
if (require("RevoUtilsMath")) {
setMKLthreads(4)
}
检查的进展情况
更新
可以肯定的是,它将加速与线性代数相关的东西,当然,这里没有问题。但看看你的代码,我会说它可能主要由I/O、所有的read.csv
和write.csv
调用控制,总的加速可能很小。移动到未格式化的数据可能会节省更多时间。。。