场景:
如果我有这个表,我们称之为 df:
survey_answer_1___1 | survey_answer_1___2 | survey_answer_1___3 | survey_answer_2___1 | survey_answer_2___2 |
---|---|---|---|---|
1 | 1 | 0 | 1 | 0 |
0 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 1 | 0 |
1 | 1 | 1 | 0 | 0 |
这里是一个R示例,其中新列可以是任意值
# Build a 4 x 5 example data frame of 0/1 answers. Column names follow the
# pattern "<question>__<option id>"; the last option id is 5 rather than 2 to
# show that option ids can be arbitrary values.
df <- as.data.frame(matrix(
  c(1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0),
  nrow = 4,
  ncol = 5,
  dimnames = list(
    1:4,
    paste0("survey_answer_", c(1, 1, 1, 2, 2), "__", c(1, 2, 3, 1, 5))
  )
))
df
#>   survey_answer_1__1 survey_answer_1__2 survey_answer_1__3 survey_answer_2__1
#> 1                  1                  1                  0                  1
#> 2                  0                  1                  0                  0
#> 3                  0                  0                  0                  1
#> 4                  1                  1                  1                  0
#>   survey_answer_2__5
#> 1                  0
#> 2                  0
#> 3                  0
#> 4                  0

# One entry per column: c(<full column name>, <question>, <option id>).
var <- Map(c, names(df), strsplit(names(df), "__"))
question <- vapply(var, \(v) v[2], character(1))

# Per question: sum that question's columns and label each total with its
# option id. The values are vectors, so tapply() returns a list-like result
# with one element per question.
result <- tapply(var, question, \(x) {
  cols <- vapply(x, \(v) v[1], character(1))
  ids <- vapply(x, \(v) v[3], character(1))
  setNames(colSums(df[cols]), ids)
})

# Assign each element of the result list to a variable named after its question:
list2env(result, environment())
survey_answer_1
#> 1 2 3
#> 2 3 1
survey_answer_2
#> 1 5
#> 2 0
使用 R/tidyverse:首先用 dplyr::summarize() 对所有列求和;然后用 tidyr::pivot_longer() 转换为长格式;接着用 split() 按 survey_answer 拆分;最后用 purrr::map() 在结果列表的每个元素上删除所有全为 NA 的列:
library(dplyr)
library(tidyr)
library(purrr)
# Sum every column, reshape so that each question becomes one row and each
# option id one column, then split into a named list with one tibble per
# question. Column names are expected to look like "<question>___<option id>"
# (three underscores), which is what names_sep splits on.
survey_dfs <- df %>%
  summarize(across(everything(), sum)) %>%
  pivot_longer(
    everything(),
    names_to = c("survey_answer", ".value"),
    names_sep = "___"
  ) %>%
  split(.$survey_answer, drop = TRUE) %>%
  # Per question: drop the columns that are entirely NA and the
  # survey_answer key column itself.
  map(\(d) select(d, where(\(col) !all(is.na(col))) & !survey_answer))
survey_dfs
$survey_answer_1
# A tibble: 1 × 3
`1` `2` `3`
<dbl> <dbl> <dbl>
1 2 3 1
$survey_answer_2
# A tibble: 1 × 2
`1` `2`
<dbl> <dbl>
1 2 0
这将为您提供一个命名的数据帧列表,在大多数情况下这是最佳实践。如果您真的想把生成的数据帧释放到全局环境中,那么可以把 map() 调用替换为 purrr::iwalk(),并在其中调用 assign():
# Same pipeline as the list-returning version, but instead of returning a list
# each per-question tibble is assigned to a variable named after its question.
df %>%
  summarize(across(everything(), sum)) %>%
  pivot_longer(
    everything(),
    names_to = c("survey_answer", ".value"),
    names_sep = "___"
  ) %>%
  split(.$survey_answer, drop = TRUE) %>%
  iwalk(\(d, dname) {
    # Drop the all-NA option columns and the survey_answer key column.
    d <- select(d, where(\(col) !all(is.na(col))) & !survey_answer)
    # pos = 1 is the first entry of the search path, i.e. the global environment.
    assign(dname, d, pos = 1)
  })
survey_answer_1
# A tibble: 1 × 3
`1` `2` `3`
<dbl> <dbl> <dbl>
1 2 3 1
对于答案1,您可以执行以下操作:
# Grab the columns that belong to question 1. The trailing underscore in the
# prefix keeps columns such as "survey_answer_10___1" from matching as well.
prefix = 'survey_answer_1_'
df_answer_1 = df[[col for col in df.columns if col.startswith(prefix)]].copy()
# Rename each column to its option id: the text after the last underscore,
# which also works for ids with more than one digit.
df_answer_1.columns = [col.rsplit('_', 1)[-1] for col in df_answer_1.columns]
# Sum up columns: one total per option id
answer_1_sums = df_answer_1.sum()
你可以对答案2做同样的事情。
假设数据在 csv 文件中:
survey_answer_1___1,survey_answer_1___2,survey_answer_1___3,survey_answer_2___1,survey_answer_2___2
1,1,0,1,0
0,1,0,0,0
0,0,0,1,0
1,1,1,0,0
读取数据:
import csv

# Read every CSV row into a dict keyed by the header names; DictReader leaves
# all values as strings. newline='' is what the csv module documentation
# recommends for files handed to its readers.
with open('input.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    df = list(reader)
处理数据:
from collections import defaultdict, Counter

# dd maps question name -> Counter of {option id (int): running total}.
dd = defaultdict(Counter)
for row in df:
    for k, v in row.items():
        # A column name is "<question>___<option id>"; unpacking into exactly
        # two names raises ValueError for any header that does not fit.
        key1, key2 = k.split('___')
        dd[key1][int(key2)] += int(v)
打印结果:
# One line per question: its name followed by (option id, total) pairs
# sorted by option id.
for k in dd:
    print(k, sorted(dd[k].items()))
在Python中:
# raw data: column name -> list of 0/1 answers, one entry per respondent
df = {
    "survey_answer_1___1": [1, 0, 0, 1],
    "survey_answer_1___2": [1, 1, 0, 1],
    "survey_answer_1___3": [0, 0, 0, 1],
    "survey_answer_2___1": [1, 0, 1, 0],
    "survey_answer_2___2": [0, 0, 0, 0],
}
# sum up the answers: column name -> total
sum_df = {k: sum(v) for k, v in df.items()}
# extract answer_1: option id (text after the last underscore) -> total.
# The trailing underscore in the prefix keeps "survey_answer_10..." out.
survey_answer_1 = {
    k.rsplit("_", 1)[-1]: sum_df[k]
    for k in sum_df
    if k.startswith("survey_answer_1_")
}
survey_answer_1
# {'1': 2, '2': 3, '3': 1}
# extract answer_2
survey_answer_2 = {
    k.rsplit("_", 1)[-1]: sum_df[k]
    for k in sum_df
    if k.startswith("survey_answer_2_")
}
survey_answer_2
# {'1': 2, '2': 0}