我正在努力与R和表的一些初学者问题。我的大部分数据可视化时间都花在Tableau上,但我希望能够在R中复制工作,以利用RMarkdown和StatCanR库的报告生成能力,使我能够从加拿大统计局的CANSIM/CODR表中提取数据。我的编程经验是沿着C, c++, Java, Javascript和Python的线,除了Python,其他都是在世纪之交的大学里学的。
我正在提取某些类型犯罪的比率,并创建了以下表格。
```# A tibble: 4 × 11
Violations `2011` `2012` `2013` `2014` `2015` `2016` `2017` `2018` `2019` `2020`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Total, all Criminal Code violati… 5780. 5638. 5206. 5061. 5232. 5297. 5375. 5513. 5878. 5301.
2 Total violent Criminal Code viol… 1236. 1199. 1096. 1044. 1070. 1076. 1113. 1152. 1279. 1254.
3 Total property crime violations … 3536. 3438. 3154. 3100. 3231. 3239. 3265. 3348. 3512. 3071.
4 Total drug violations [401] 330. 317. 311. 295. 280. 267. 254. 229. 186. 176.
我已经过滤掉了超过十年的数据,并且只针对某些犯罪。
# Pivot the data
table_01 <- pivot_wider(table_01 %>%select("REF_DATE","Violations","VALUE"),names_from=REF_DATE, values_from=VALUE)
table01a<-table_01 %>%select(2020,2019,2011)
)
mutate(
ten_year_change = 2020-2011,
one_year_change = 2020-2019
)
我一直在摆弄不同的库,包括tidyverse和dplyr。我希望代码计算最近两年之间的差异和最近一年和(最近一年- 10年前)之间的差异。这个想法是在加拿大统计局更新他们的数据时生成一个新的报告。
上面的代码绝对不是我想要的。我确实希望我计算差异的年份不要硬编码,这样我就不必在六个月内编辑代码。
我怀疑我没有领会R的做事方式,但如果我能在正确的方向上得到推动,我会很感激的。
下面是TLDR完整的RMarkdown脚本:
---
title: "CJS Statistical Summary"
output: word_document
date: '2021-10-05'
---
` ` `{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
#load libraries
#install.packages("tidyverse")
#install.packages("statcanR")
#install.packages("flextable")
#install.packages("dplyr")
library("tidyverse")
library("statcanR")
library("flextable")
library("dplyr")
setwd("~/R_Scripts") # change for a Windows-style path if ran in Windows.
#set language
language <-"eng"
# Load dataset Incident-based crime statistics, by detailed violations
CODR_0177 <- statcan_data('35-10-0177-01', language)
# Code not written for these CODR tables
#CODR_0027 <- statcan_data('35-10-0027-01', language)
#CODR_0038 <- statcan_data('35-10-0038-01', language)
#CODR_0029 <- statcan_data('35-10-0029-01', language)
#CODR_0022 <- statcan_data('35-10-0022-01', language)
#CODR_0006 <- statcan_data('35-10-0006-01', language)
` ` `
## Table 1
` ` `{r table_01, echo=FALSE}
# Develop table 1 - Crime Stats
# =============================
# Find most recent ten years
years <- distinct(CODR_0177 %>% select("REF_DATE"))
years <- arrange(years,desc(REF_DATE))%>% slice(1:10)
# Copying the crime stats table so it isn't altered in case we need to reuse it.
table_01 <- CODR_0177
# Remove unused columns
table_01 <- table_01 %>% select("REF_DATE","GEO","Violations","Statistics","UOM","VALUE") %>% filter(REF_DATE %in% years$REF_DATE)
# Keep only national data
table_01 <- table_01 %>% filter(GEO == "Canada")
# Keep only crime rate
table_01 <- table_01 %>% filter(Statistics == "Rate per 100,000 population")
# Keep only certain Violations
display_violations <- c("Total, all Criminal Code violations (excluding traffic) [50]","Total violent Criminal Code violations [100]","Total property crime violations [200]","Total drug violations [401]" )
table_01 <- table_01 %>% filter(Violations %in% display_violations)
# Pivot the data
table_01 <- pivot_wider(table_01 %>%select("REF_DATE","Violations","VALUE"),names_from=REF_DATE, values_from=VALUE)
#calculating year to year differences
table01a<-table_01 %>%select(2020,2019,2011)
)
mutate(
ten_year_change = 2020-2011,
one_year_change = 2020-2019
)
# Edit look and feel for report using Flextable
flex_table_01<-flextable(table_01)
flex_table_01<-theme_vanilla(flex_table_01)
flex_table_01<-add_header_row(
flex_table_01,
values=c("","Rates per 100,000 population","% change"),
colwidths=c(1,10,2)
)
flex_table_01<-add_header_row(
flex_table_01,
values=c("Incidents Reported to Police (Crime Rate)"),
colwidths=c(13)
)
flex_table_01 <- align(flex_table_01, i = 1, part = "header", align = "center")
flex_table_01 <- fontsize(flex_table_01, i = NULL, j = NULL, size = 8, part = "all")
flex_table_01 <- colformat_double(flex_table_01, big.mark=",", digits = 0, na_str = "N/A")
flex_table_01
#remove temporary files
rm(years)
rm(display_violations)
rm(table_01)
使用"long"格式。下面是一个假数据的例子。我们使用lag
函数来获得不同时间范围内的变化。一旦在不同的时间尺度上添加了更改,就可以根据需要对数据进行子集和重塑,以创建最终的表。
library(tidyverse)
# Fake data
set.seed(2)
d = tibble(
REF_DATE = rep(2010:2020, each=4),
Violations = rep(LETTERS[1:4], 11),
value = sample(100:200, 44)
)
d
#> # A tibble: 44 × 3
#> REF_DATE Violations value
#> <int> <chr> <int>
#> 1 2010 A 184
#> 2 2010 B 178
#> 3 2010 C 169
#> 4 2010 D 105
#> 5 2011 A 131
#> 6 2011 B 107
#> 7 2011 C 116
#> 8 2011 D 192
#> 9 2012 A 180
#> 10 2012 B 175
#> # … with 34 more rows
d1 = d %>%
arrange(Violations, REF_DATE) %>%
group_by(Violations) %>%
mutate(lag1 = value - lag(value),
lag10 = value - lag(value, n=10))
print(d1, n=23)
#> # A tibble: 44 × 5
#> # Groups: Violations [4]
#> REF_DATE Violations value lag1 lag10
#> <int> <chr> <int> <int> <int>
#> 1 2010 A 184 NA NA
#> 2 2011 A 131 -53 NA
#> 3 2012 A 180 49 NA
#> 4 2013 A 174 -6 NA
#> 5 2014 A 189 15 NA
#> 6 2015 A 132 -57 NA
#> 7 2016 A 139 7 NA
#> 8 2017 A 108 -31 NA
#> 9 2018 A 101 -7 NA
#> 10 2019 A 147 46 NA
#> 11 2020 A 193 46 9
#> 12 2010 B 178 NA NA
#> 13 2011 B 107 -71 NA
#> 14 2012 B 175 68 NA
#> 15 2013 B 164 -11 NA
#> 16 2014 B 154 -10 NA
#> 17 2015 B 153 -1 NA
#> 18 2016 B 115 -38 NA
#> 19 2017 B 171 56 NA
#> 20 2018 B 166 -5 NA
#> 21 2019 B 190 24 NA
#> 22 2020 B 117 -73 -61
#> 23 2010 C 169 NA NA
#> # … with 21 more rows
我们也可以一次执行多个滞后:
d2 = d %>%
arrange(Violations, REF_DATE) %>%
group_by(Violations) %>%
mutate(map_dfc(1:10 %>% set_names(paste0("lag.", .)),
~ value - lag(value, n=.x)))
d2
#> # A tibble: 44 × 13
#> # Groups: Violations [4]
#> REF_DATE Violations value lag.1 lag.2 lag.3 lag.4 lag.5 lag.6 lag.7 lag.8
#> <int> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 2010 A 184 NA NA NA NA NA NA NA NA
#> 2 2011 A 131 -53 NA NA NA NA NA NA NA
#> 3 2012 A 180 49 -4 NA NA NA NA NA NA
#> 4 2013 A 174 -6 43 -10 NA NA NA NA NA
#> 5 2014 A 189 15 9 58 5 NA NA NA NA
#> 6 2015 A 132 -57 -42 -48 1 -52 NA NA NA
#> 7 2016 A 139 7 -50 -35 -41 8 -45 NA NA
#> 8 2017 A 108 -31 -24 -81 -66 -72 -23 -76 NA
#> 9 2018 A 101 -7 -38 -31 -88 -73 -79 -30 -83
#> 10 2019 A 147 46 39 8 15 -42 -27 -33 16
#> # … with 34 more rows, and 2 more variables: lag.9 <int>, lag.10 <int>
由reprex包(v2.0.1)在2018-10-05创建