用R规范化数据帧



我有一个名为 examples 的数据框,其中记录了一些语义特征的出现次数:

> str(examples)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':    50 obs. of  12 variables:
$ filename      : chr  "Text01" "Text02" "Text03" "Text04" ...
$ Control       : num  1 3 0 0 0 6 0 1 0 1 ...
$ Economic      : num  1 3 0 0 0 0 1 0 1 2 ...
$ ExternalVoices: num  1 2 1 1 1 2 1 4 0 1 ...
$ JobsSkills    : num  0 0 0 0 0 2 0 3 0 0 ...
$ LegalStatus   : num  0 3 4 0 5 0 1 0 4 0 ...
$ Modals        : num  4 6 1 5 4 4 2 6 2 2 ...
$ Orign         : num  2 6 8 6 3 5 3 3 2 6 ...
$ Sanctions     : num  1 3 0 3 0 3 2 1 1 0 ...
$ Subjectivisms : num  2 3 4 4 3 2 1 1 2 4 ...
$ Verbs         : num  3 7 3 11 6 2 7 7 4 5 ...
$ LineTotal         : num  130 274 258 419 268 210 379 244 172 199 ...
- attr(*, "spec")=
.. cols(
..   filename = col_character(),
..   Control = col_double(),
..   Economic = col_double(),
..   ExternalVoices = col_double(),
..   JobsSkills = col_double(),
..   LegalStatus = col_double(),
..   Modals = col_double(),
..   Orign = col_double(),
..   Sanctions = col_double(),
..   Subjectivisms = col_double(),
..   Verbs = col_double(),
..   LineTotal = col_double()
.. )
head(examples)
A tibble: 6 x 12
filename Control Economic ExternalVoices JobsSkills LegalStatus Modals Orign Sanctions Subjectivisms Verbs LineTotal
<chr>      <dbl>    <dbl>          <dbl>      <dbl>       <dbl>  <dbl> <dbl>     <dbl>         <dbl> <dbl> <dbl>
1 Text01         1        1              1          0           0      4     2         1             2     3   130
2 Text02         3        3              2          0           3      6     6         3             3     7   274
3 Text03         0        0              1          0           4      1     8         0             4     3   258
4 Text04         0        0              1          0           0      5     6         3             4    11   419
5 Text05         0        0              1          0           5      4     3         0             3     6   268
6 Text06         6        0              2          2           0      4     5         3             2     2   210

我需要应用一个公式:将每个单元格的值乘以1000,然后除以该行的总数,即逐行除以"LineTotal"列中对应的值:

(cell × 1000) / LineTotal

LineTotal 的值每一行都不同。

我甚至想不出一种方法来完成它。任何帮助都会很棒!

谢谢!

编辑

提供 dput(df) 的输出,以便复现数据:

> dput(df)
structure(list(filename = c("Text01", "Text02", "Text03", "Text04", 
"Text05", "Text06", "Text07", "Text08", "Text09", "Text10", "Text11", 
"Text12", "Text13", "Text14", "Text15", "Text16", "Text17", "Text18", 
"Text19", "Text20", "Text21", "Text22", "Text23", "Text24", "Text25", 
"Text26", "Text27", "Text28", "Text29", "Text30", "Text31", "Text32", 
"Text33", "Text34", "Text35", "Text36", "Text37", "Text38", "Text39", 
"Text40", "Text41", "Text42", "Text43", "Text44", "Text45", "Text46", 
"Text47", "Text48", "Text49", "Text50"), Control = c(1, 3, 0, 
0, 0, 6, 0, 1, 0, 1, 1, 4, 0, 2, 0, 3, 3, 1, 1, 0, 0, 2, 1, 5, 
5, 2, 0, 0, 1, 3, 3, 0, 0, 0, 4, 1, 1, 2, 0, 0, 0, 0, 4, 0, 3, 
0, 2, 1, 0, 0), Economic = c(1, 3, 0, 0, 0, 0, 1, 0, 1, 2, 0, 
1, 2, 2, 0, 1, 1, 4, 0, 1, 0, 0, 1, 2, 0, 2, 1, 0, 0, 0, 0, 2, 
1, 3, 7, 1, 2, 3, 0, 4, 0, 1, 0, 2, 0, 0, 0, 0, 0, 2), ExternalVoices = c(1, 
2, 1, 1, 1, 2, 1, 4, 0, 1, 0, 8, 6, 2, 0, 6, 1, 2, 3, 0, 1, 4, 
2, 1, 2, 0, 0, 3, 1, 2, 1, 1, 4, 7, 5, 2, 1, 3, 0, 0, 2, 0, 3, 
0, 4, 1, 1, 2, 0, 1), JobsSkills = c(0, 0, 0, 0, 0, 2, 0, 3, 
0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 
0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
), LegalStatus = c(0, 3, 4, 0, 5, 0, 1, 0, 4, 0, 1, 3, 2, 0, 
0, 3, 0, 0, 1, 1, 0, 1, 0, 2, 3, 0, 3, 0, 2, 0, 2, 2, 12, 2, 
0, 0, 3, 0, 1, 1, 5, 2, 0, 0, 5, 1, 7, 3, 1, 0), Modals = c(4, 
6, 1, 5, 4, 4, 2, 6, 2, 2, 0, 5, 2, 1, 7, 5, 6, 1, 0, 0, 1, 2, 
6, 0, 2, 8, 0, 3, 8, 0, 1, 2, 5, 13, 2, 7, 1, 2, 4, 0, 2, 4, 
5, 8, 5, 0, 2, 7, 1, 3), Orign = c(2, 6, 8, 6, 3, 5, 3, 3, 2, 
6, 1, 8, 2, 7, 8, 8, 12, 7, 6, 2, 3, 5, 5, 2, 2, 4, 2, 1, 7, 
6, 5, 5, 11, 5, 7, 12, 6, 8, 5, 12, 12, 1, 4, 7, 7, 3, 6, 2, 
3, 5), Sanctions = c(1, 3, 0, 3, 0, 3, 2, 1, 1, 0, 0, 3, 1, 2, 
1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 5, 4, 3, 4, 2, 2, 0, 6, 6, 6, 0, 
2, 4, 1, 4, 9, 1, 3, 3, 4, 2, 1, 4, 1, 0, 1), Subjectivisms = c(2, 
3, 4, 4, 3, 2, 1, 1, 2, 4, 0, 5, 4, 2, 2, 2, 2, 5, 4, 0, 0, 4, 
3, 2, 6, 5, 0, 4, 10, 4, 2, 0, 7, 3, 6, 3, 2, 6, 4, 7, 6, 1, 
4, 3, 2, 0, 4, 5, 2, 4), Verbs = c(3, 7, 3, 11, 6, 2, 7, 7, 4, 
5, 1, 9, 6, 7, 10, 6, 11, 7, 7, 2, 2, 8, 5, 8, 7, 8, 2, 6, 6, 
6, 7, 4, 12, 10, 7, 11, 9, 10, 7, 21, 11, 3, 4, 9, 7, 3, 4, 6, 
2, 10), Total = c(130, 274, 258, 419, 268, 210, 379, 244, 172, 
199, 87, 462, 211, 251, 382, 313, 509, 287, 253, 123, 92, 269, 
292, 313, 311, 361, 200, 261, 387, 261, 263, 293, 554, 587, 325, 
562, 434, 315, 521, 660, 661, 202, 204, 297, 549, 161, 368, 288, 
71, 341)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -50L), spec = structure(list(cols = list(
filename = structure(list(), class = c("collector_character", 
"collector")), Control = structure(list(), class = c("collector_double", 
"collector")), Economic = structure(list(), class = c("collector_double", 
"collector")), ExternalVoices = structure(list(), class = c("collector_double", 
"collector")), JobsSkills = structure(list(), class = c("collector_double", 
"collector")), LegalStatus = structure(list(), class = c("collector_double", 
"collector")), Modals = structure(list(), class = c("collector_double", 
"collector")), Orign = structure(list(), class = c("collector_double", 
"collector")), Sanctions = structure(list(), class = c("collector_double", 
"collector")), Subjectivisms = structure(list(), class = c("collector_double", 
"collector")), Verbs = structure(list(), class = c("collector_double", 
"collector")), Total = structure(list(), class = c("collector_double", 
"collector"))), default = structure(list(), class = c("collector_guess", 
"collector")), skip = 1), class = "col_spec"))

谢谢!

这里有一个使用tidyverse的可能解决方案:

library(tidyverse)

# Convert every feature count to a rate per 1000 lines of its own row:
# value * 1000 / LineTotal.
# across() replaces the superseded mutate_at(); `filename` and `LineTotal`
# are excluded by exact name rather than by a regular-expression match, so
# columns whose names merely contain those strings are not skipped by accident.
# The result is printed, not assigned; store it with `<-` to keep it.
examples %>%
  mutate(
    across(
      -c(filename, LineTotal),
      ~ .x * 1000 / LineTotal
    )
  )

我想这就是您想要的

1) 确定行总数列(LineTotal)的列号

# Position of "LineTotal" after the first column has been dropped (hence
# the - 1), so it can index a row taken from examples[, -1].
colname_tot <- which(names(examples) == "LineTotal") - 1

2) 使用 apply 对每一行执行该运算

# Rescale each row to value * 1000 / LineTotal and re-attach the first column.
# cbind() is required here: the first column and the rescaled values have the
# same number of rows but different numbers of columns, so rbind() fails.
# examples[1] (not examples[, 1]) keeps the first column as a named
# one-column data frame for both tibbles and plain data.frames.
# Note: the LineTotal column itself is also divided by LineTotal, so it
# becomes 1000 in every row with a non-zero total.
examples <- cbind(
  examples[1],
  as.data.frame(
    t(apply(examples[, -1], 1, function(x) {
      x * 1000 / x[colname_tot]
    }))
  )
)

希望这能有所帮助,

我认为这就是您想要的(其中 example 是您的数据框):

# Rescale every feature count to a rate per 1000 lines of its own row:
# value * 1000 / LineTotal.
# The previous version divided each column by sum(column), i.e. by the column
# total, which is not the per-row LineTotal the question asks for, and it also
# rescaled LineTotal itself.
# `filename` (character) and `LineTotal` are left untouched.
feature_cols <- setdiff(names(example), c("filename", "LineTotal"))

# List-style indexing (example[cols], example[["LineTotal"]]) is used because
# `[, j]` on a tibble returns a one-column tibble rather than a plain vector.
example[feature_cols] <- lapply(
  example[feature_cols],
  function(x) x * 1000 / example[["LineTotal"]]
)

编辑:一个问题是数据帧的第一列中有字符。这段新代码使用了您提供的示例数据。

最新更新