r-使用pivot_langer()使用REGEX tidyr收集数百个类似的变量

我有一个关于pivot_langer((的问题，这个问题以前似乎没有讨论过。该数据集有超过10000列，但只有242行。以下仅包含估计信息，在所有这些中有21个其他变量(较低的CI与亚型等(，它们也必须延长。

TEST_table <- structure(list(
Estimate.1.Country.... = c("45", "", "22", "39", "23", "2"),
Estimate.1.Year.Season = c("2017/18", "", "2017/18", "2012/13", "2013/14", ""),
Estimate.1.Objective.... = c("", "", "1A", "1A", "", ""),
Estimate.1.Risk.group.... = c("", "", "2", "2", "", ""),
Estimate.1.Risk.group.description = c("", "", "aged 65 years or older", "aged 65 years or older", "", ""),
Estimate.1.Age.range...lower = c("", "", "65", "65", "", ""),
Estimate.1.Age.range...upper = c("", "", "", "", "", ""),
Estimate.1.Sex = c("", "", "", "", "", ""),
Estimate.1.Vaccine.... = c("", "", "all vaccines", "all vaccines", "", ""),
Estimate.1.Match...y.n. = c("", "", "", "", "", ""),
Estimate.1.Vaccine.strains = c("", "", "", "", "", ""),
Estimate.1.Circulating.strains = c("A(H3N2) [dominant]","","","A(H3N2) [dominant]","A(H1N1)pdm09 [dominant]","A(H1N1)pdm09 [dominant]"),
Estimate.1.Outcome.... = c("1", "", "1", "1", "", ""),
Estimate.1.vs.virus..sub.type = c("", "", "influenza A and B", "influenza A", "", ""),
Estimate.1.Measure = c("", "", "aVE (%)", "aVE (%)", "", ""),
Estimate.1.Value = c("", "", "88.5", "34", "", ""),
Estimate.1.CI...lower = c(NA, NA, 38.9,-1, NA, NA),
Estimate.1.CI...upper = c(NA, NA, 97.8, 60, NA, NA),
Estimate.1.Comment = c("","",
"adjusted for age, sex, hospitalisations and GP visits in last 12 months, vaccination status for previous years, underling conditions, deterioration, dependency",
"adjusted for age, sex, underlying medical conditions, BMI, month of illness, and interval from ILI onset","",""),
Estimate.2.Country.... = c("", "", "22", "39", "", ""),
Estimate.2.Year.Season = c("", "", "2018/19", "2013/14", "", ""),
Estimate.2.Objective.... = c("", "", "1A", "1A", "", ""),
Estimate.2.Risk.group.... = c("", "", "2", "2", "", ""),
Estimate.2.Risk.group.description = c("", "", "aged 65 years or older", "aged 65 years or older", "", ""),
Estimate.2.n = c(NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_),
Estimate.2.Age.range...upper = c("", "", "", "", "", ""),
Estimate.2.Sex = c("", "", "", "", "", ""),
Estimate.2.Vaccine.... = c("", "", "influenza A and B", "all vaccines", "", ""),
Estimate.2.Match...y.n. = c("", "", "", "", "", ""),
Estimate.2.Vaccine.strains = c("", "", "", "", "", ""),
Estimate.2.Circulating.strains = c("", "", "", "A(H3N2) [dominant]", "", ""),
Estimate.2.Outcome.... = c("", "", "1", "1", "", ""),
Estimate.2.vs.virus..sub.type = c("", "", "influenza A and B", "influenza A", "", ""),
Estimate.2.Measure = c("", "", "aVE (%)", "aVE (%)", "", ""),
Estimate.2.Value = c(NA,NA, 61.7,-25, NA, NA),
Estimate.2.CI...lower = c(NA, NA,-59.9,-96, NA, NA),
Estimate.2.CI...upper = c(NA, NA, 90.9, 20, NA, NA),
Estimate.2.Comment = c("","",
"adjusted for age, sex, hospitalisations and GP visits in last 12 months, vaccination status for previous years, underling conditions, deterioration, dependency",
"adjusted for age, sex, underlying medical conditions, BMI, month of illness, and interval from ILI onset","",""),
Estimate.3.Country.... = c("45", "", "", "39", "", ""),
Estimate.3.Year.Season = c("2017/18", "", "", "2014/15", "", ""),
Estimate.3.Objective.... = c("1A", "", "", "1A", "", ""),
Estimate.3.Risk.group.... = c("2", "", "", "2", "", ""),
Estimate.3.Risk.group.description = c("aged 65 years or older", "", "", "aged 65 years or older", "", ""),
Estimate.3.n = c(NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_),
Estimate.3.N = c(NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_,NA_integer_),
Estimate.3.Sex = c("", "", "", "", "", ""),
Estimate.3.Vaccine.... = c("1", "", "", "all vaccines", "", ""),
Estimate.3.Match...y.n. = c("", "", "", "", "", ""),
Estimate.3.Vaccine.strains = c("", "", "", "", "", ""),
Estimate.3.Circulating.strains = c("A(H3N2) [dominant]", "", "", "A(H3N2) [dominant]", "", ""),
Estimate.3.Outcome.... = c("1", "", "", "1", "", ""),
Estimate.3.vs.virus..sub.type = c("influenza A and B","", "", "influenza A", "", ""),
Estimate.3.Measure = c("uVE (%)","", "", "aVE (%)", "", ""),
Estimate.3.Value = c(4, NA, NA,-22, NA, NA),
Estimate.3.CI...lower = c(-13, NA, NA,-71, NA, NA),
Estimate.3.CI...upper = c(19, NA, NA, 13, NA, NA),
Estimate.3.Comment = c("","","",
"adjusted for age, sex, underlying medical conditions, BMI, month of illness, and interval from ILI onset","",""),
Estimate.4.Country.... = c("45", "", "", "39", "", "")),
row.names = c(NA, 6L),class = "data.frame")

正如在上面的数据中没有完全看到的那样，我有500列单独的值，我希望收集/透视更长的时间。因为它们没有用_之类的值分隔，所以我需要使可用的names_to=和names_patterns=一样多。

这是我的编码，但无论我如何输入REGEX代码，它似乎都不起作用。我需要22个模式来识别列名中的字符串，例如"字符串"；国家；或"；子类型"；并将它们作为相同的变量进行计数以达到延长的目的。

Data.L %>% pivot_longer(cols = 153:10652,
names_to = c('Country', '.value', 'Season', '.value', 'Objective', '.value', 'RiskGroup', '.value','RiskGroupDescription', '.value',
'n', '.value', 'N', '.value', 'AgeRangeLower', '.value', 'AgeRangerUpper', '.value', 'Sex', '.value', 'Vaccine(#)', '.value',
'VaccineMatch', '.value', 'VaccineStrains',  '.value', 'CirculatingStrains', '.value', 'Outcome', '.value',
'VirusSubtype', '.value', 'EstimateMeasure', '.value', 'EstimateValue', '.value',
'EstimateLowerCI', '.value', 'EstimateUpperCI', '.value', 'Comment', '.value'),
# names_prefix = 'Estimate.',
names_patterns = c())

我尝试的名称模式包括(但当然不限于(：

'(.*Country.*)', '(.*)Year(.*)', '(.*)Objective(.*)', '(.*)Risk.group..(.*)',
'(.*)Risk.group.d(.*)', '(.*)n(.*)', '(.*)N(.*)', '(.*)Age.range...lower(.*)',
'(.*)Age.range...upper(.*)', '(.*)Sex(.*)', '(.*)Vaccine....(.*)', '(.*)Match(.*)',
'(.*)Vaccine.s(.*)', '(.*)Cir(.*)', '(.*)Outcome(.*)', '(.*)vs.(.*)', '(.*)Mea(.*)', '(.*)Value(.*)',
'(.*)CI...lower(.*)', '(.*)CI...upper(.*)', '(.*)Com(.*)'

非常感谢任何建议。我希望使用regex来实现这一点，这样我就不会通过整形等手动编译。

干杯

您可以在names_to中设置".value"，并提供names_sep或names_pattern之一来指定列名的拆分方式。

library(tidyr)
TEST_table %>%
pivot_longer(
everything(),
names_to = c("Est", ".value"),
names_pattern = "([^.]+\.\d+)\.(.+)",
values_transform = list(Value = as.numeric)
)
# A tibble: 24 × 22
Est        Country.... Year.Season Objective.... Risk.group.... Risk.group.description   Age.range...lower Age.range...upper Sex   Vaccine....
<chr>      <chr>       <chr>       <chr>         <chr>          <chr>                    <chr>             <chr>             <chr> <chr>
1 Estimate.1 "45"        "2017/18"   ""            ""             ""                       ""                ""                ""    ""
2 Estimate.2 ""          ""          ""            ""             ""                        NA               ""                ""    ""
3 Estimate.3 "45"        "2017/18"   "1A"          "2"            "aged 65 years or older"  NA                NA               ""    "1"
4 Estimate.4 "45"         NA          NA            NA             NA                       NA                NA                NA    NA
5 Estimate.1 ""          ""          ""            ""             ""                       ""                ""                ""    ""
6 Estimate.2 ""          ""          ""            ""             ""                        NA               ""                ""    ""
7 Estimate.3 ""          ""          ""            ""             ""                        NA                NA               ""    ""
8 Estimate.4 ""           NA          NA            NA             NA                       NA                NA                NA    NA
9 Estimate.1 "22"        "2017/18"   "1A"          "2"            "aged 65 years or older" "65"              ""                ""    "all vaccines"     
10 Estimate.2 "22"        "2018/19"   "1A"          "2"            "aged 65 years or older"  NA               ""                ""    "influenza A and B"
# … with 14 more rows

注意：values_transform = list(Value = as.numeric)消除了以下错误：

Error in `pivot_longer_spec()`:
! Can't combine `Estimate.1.Value` <character> and `Estimate.2.Value` <double>.

相关内容

最新更新

热门标签：