层次数据列表R或Python中的数据包装



我有一些数据是以我试图为客户获取的格式提供给我的。它是分层数据,但并没有填写所有数据,而且由于涉及到太多不同的子级别,你不能简单地填写。数字总是4位数,表示特定的东西。

这是一个可以发送到具有数千行数据的数十个子组的报告。

这是R:中的一个示例

L1 <- c("Main1", rep(NA, 21), "Main2", "Main3")
L2 <- c(NA, "Sub2_1", rep(NA, 22))
L3 <- c(NA, NA, "Sub3_1", rep(NA, 17), "Sub3_2", rep(NA, 3))
L4 <- c(rep(NA, 3), "Sub4_1", rep(NA, 9), "Sub4_2", rep(NA, 7), "0015", rep(NA, 2))
L5 <- c(rep(NA, 4), "Sub5_1", NA, NA, "Sub5_2", NA, "Sub5_3", rep(NA, 4), "Sub5_5", rep(NA, 9))
L6 <- c(rep(NA, 5), "1111", "2885", NA, "0001", NA, "Sub6_1", rep(NA, 4), "Sub6_2", rep(NA, 8))
L7 <- c(rep(NA, 11), "Sub7_1", rep(NA, 4), "Sub7_2", rep(NA, 7))
L8 <- c(rep(NA, 12), "0011", rep(NA, 4), "9494", "Sub8_1", rep(NA, 5))
L9 <- c(rep(NA, 19), "8479", rep(NA, 4))
df <- data.frame(L1, L2, L3, L4, L5, L6, L7, L8, L9)

我想要这样的输出,因为四位数的"代码"是我们真正需要查找的:

code_f <- c("1111", "2885", "0001", "0011", "9494", "8479", "0015", NA, NA)
L1_f <- c(rep("Main1", 7), "Main2", "Main3")
L2_f <- c(rep("Sub2_1", 7), NA, NA)
L3_f <- c(rep("Sub3_1", 6), "Sub3_2", NA, NA)
L4_f <- c(rep("Sub4_1", 4), rep("Sub4_2", 2), rep(NA, 3))
L5_f <- c(rep("Sub5_1", 2), "Sub5_2", "Sub5_3", rep("Sub5_5", 2), rep(NA, 3))
L6_f <- c(rep(NA, 3), "Sub6_1", rep("Sub6_3", 2), rep(NA, 3))          
L7_f <- c(rep(NA, 3), "Sub7_1", rep("Sub7_2", 2), rep(NA, 3))
L8_f <- c(rep(NA, 5), "Sub8_1", rep(NA, 3))
df_f <- data.frame(code_f, L1_f, L2_f, L3_f, L4_f, L5_f, L6_f, L7_f, L8_f)

我在您的数据中看不到0015,所以无法判断它来自哪里。但根据您提供的内容,我们可以做到:

mm = function(data,i=1){
dat = tidyr::fill(data,!!!names(data)[i])%>%
group_by(.dots=names(data)[1:i])
if(i<ncol(dat)) mm(dat,i+1) else data
}
df%>%
mm%>%
{b =lift(paste)(.); b[grepl("\b\d+\b",b)| rowSums(is.na({.}[-1]))==ncol(df)-1]}%>%
sub("(.*?)(\b\d+\b|NA)","\2 \1",.)%>%
read.table(text=.,fill=T,colClasses = "character")%>%
mutate(r=rowSums(is.na(.))==ncol(.)-1)%>%
group_by(V2)%>%
filter(n()==1 & r|!r)%>%
select(-r)%>%data.frame()
V1    V2     V3     V4     V5     V6     V7     V8     V9
1 1111 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1   <NA>   <NA>   <NA>
2 2885 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1   <NA>   <NA>   <NA>
3 0001 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_2   <NA>   <NA>   <NA>
4 0011 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_3 Sub6_1 Sub7_1   <NA>
5 9494 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2   <NA>
6 8479 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 Sub8_1
7   15 Main1 Sub2_1 Sub3_2   <NA>   <NA>   <NA>   <NA>   <NA>
8 <NA> Main2   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>
9 <NA> Main3   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>

python版本将是:

import pandas as pd
import numpy as np
import re
#df = pd.read_clipboard()
#df[df=="<NA>"]=np.nan
#df['L1']=df['L1'].ffill()
def mmpy(data,m,i=0):
data = data.copy(deep=True)
data.iloc[:,i] = m[data.columns[i]].ffill()
m = data.groupby(list(data.columns[0:i+1]))
if i < len(data.columns)-1:  return mmpy(data,m,i+1)
return data

s = mmpy(df,df.copy())
a = "n".join([" ".join([str(k) for k in i.values()]) for i in s.T.to_dict().values()])
b = re.sub(r"^(.*?)(bd+b|nan)",r"2 1",a,flags=re.M)
w = pd.DataFrame([i.split() for i in re.findall(r"^d+.*$|.*MainS* \D*$",b,re.M)])
0      1       2       3       4       5       6       7       8
0   nan  Main1     nan     nan     nan     nan     nan     nan     nan
1  1111  Main1  Sub2_1  Sub3_1  Sub4_1  Sub5_1     nan     nan     nan
2  2885  Main1  Sub2_1  Sub3_1  Sub4_1  Sub5_1     nan     nan     nan
3  0001  Main1  Sub2_1  Sub3_1  Sub4_1  Sub5_2     nan     nan     nan
4  0011  Main1  Sub2_1  Sub3_1  Sub4_1  Sub5_3  Sub6_1  Sub7_1     nan
5  9494  Main1  Sub2_1  Sub3_1  Sub4_2  Sub5_5  Sub6_2  Sub7_2     nan
6  8479  Main1  Sub2_1  Sub3_1  Sub4_2  Sub5_5  Sub6_2  Sub7_2  Sub8_1
7    15  Main1  Sub2_1  Sub3_2     nan     nan     nan     nan     nan
8   nan  Main2     nan     nan     nan     nan     nan     nan     nan
9   nan  Main3     nan     nan     nan     nan     nan     nan     nan

不确定我是否100%正确回答了你的问题,但这似乎复制了你想要的输出(假设df中没有一些额外的数据;如Onyambu的评论中所述(。

#change format of data
vec=c(t(as.matrix(df)))
subLevels=2:8
#regex patterns for 4-digit number and levels
patterns=c("[0-9]{4,4}","Main[0-9]{1,}",paste0("Sub",2:maxSub,"_[0-9]{1,}"))
#find indices for each level
idxList=lapply(patterns,grep,vec)
#replace all data that does not correspond to a given level by NA
valList=lapply(idxList,function(x) {tmp=vec;tmp[-x]=NA;tmp})
#the zoo library has a function to move missing values forward -> na.locf
library(zoo)
#for each 4-digit number and each level, find the respective level-string 
data.frame(code_f=na.omit(valList[[1]]),
do.call("cbind",
lapply(valList[-1],
function(x) na.locf(x,na.rm=FALSE)[idxList[[1]]])))
#   code_f    X1     X2     X3     X4     X5     X6     X7     X8
# 1   1111 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1   <NA>   <NA>   <NA>
# 2   2885 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1   <NA>   <NA>   <NA>
# 3   0001 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_2   <NA>   <NA>   <NA>
# 4   0011 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_3 Sub6_1 Sub7_1   <NA>
# 5   9494 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2   <NA>
# 6   8479 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 Sub8_1

最新更新