analysis<-function(df,variable_list,type_list,group,group_list=0,t_test=FALSE)
{
library(carData)
library(car)
library(nortest)
df <- df[which(!is.na(df[,group])),] #筛选掉没有分组的观测
if(group_list == 0)
group_list<-as.character(as.data.frame(table(df[,group]))[,1]) #group的类名
n_group<-length(unique(na.omit(df[,group]))) #group类数
#生成分析结果的dataframe
df_result<-data.frame(Variables=c('N',variable_list),Overall=NA)
each_group_num<-as.data.frame(table(df[,group]))
df_result[1,'Overall']<-count(df)
for(i in 1:n_group) #添加group的列
{
# print(group_list[i])
df_result[,group_list[i]]=NA
df_result[1,group_list[i]]=each_group_num[which(each_group_num[,1]==group_list[i]),2]
}
#添加总的四分位数列
df_result[,'Median']=NA
for(i in 1:n_group) #添加group的四分位数列
{
df_result[,paste(group_list[i],'Median')]=NA
}
df_result[,'是否正态']<-NA
df_result[,'是否方差齐性']<-NA
df_result[,'anova p-value']<-NA
df_result[,'kruskal-wallis p-value']<-NA
df_result[,'卡方检验 p-value']<-NA
if(t_test==TRUE)
df_result[,'t检验 p-value']<-NA
df_result[,'wilcox秩和检验 p-value']<-NA
#遍历字段
for(i in seq(1,dim(df_result)[1]-1))
{
mystr=variable_list[i] #字段名
df2<-df
df2<-df2[which(!is.na(df2[,group])),] #去group字段na的数据
#查找是否有对应的flag字段,若有,筛选出flag=0的数据
mystr2=ifelse(substring(mystr,1,3)=='val',paste('flag_',substring(mystr,5,nchar(mystr)),sep=""),paste('flag_',mystr,sep=""))
if(mystr2 %in% colnames(df))
df2<-df2[which(df2[,mystr2]==0),]
#连续和分类变量分别分析
if(type_list[i]=='连续')
{
df2<-df2[which(!is.na(df2[,mystr])),] #去自变量na的数据
#计算均值标准差,无法转成数值型的去掉
df_result[i+1,'Overall']=paste(as.character(formatC(mean(as.numeric(df2[,mystr]),na.rm = T),digits = 2, format = "f")),' ± ',as.character(formatC(sd(as.numeric(df2[,mystr]),na.rm = T),digits = 2, format = "f")), sep = "")
for(j in 1:n_group)
{
df_result[i+1,group_list[j]]=paste(as.character(formatC(mean(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T),digits = 2, format = "f")),' ± ',as.character(formatC(sd(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T),digits = 2, format = "f")),sep = "")
df_result[i+1,'Median']=paste(as.character(formatC(median(as.numeric(df2[,mystr]),na.rm = T),digits = 2, format = "f")),' (',as.character(formatC(quantile(as.numeric(df2[,mystr]),na.rm = T)[2],digits = 2, format = "f")),', ',as.character(formatC(quantile(as.numeric(df2[,mystr]),na.rm = T)[4],digits = 2, format = "f")),') ',sep = "")
df_result[i+1,paste(group_list[j],'Median')]=paste(as.character(formatC(median(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T),digits = 2, format = "f")),' (',as.character(formatC(quantile(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T)[2],digits = 2, format = "f")),', ',as.character(formatC(quantile(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T)[4],digits = 2, format = "f")),') ',sep = "")
}
#是否正态,样本量>5000时不能用shapiro 检验
df_result[i+1,'是否正态']=ifelse(ad.test(as.numeric(df2[,mystr]))$p.value>0.05,'是','否')
#是否方差齐性
df_result[i+1,'是否方差齐性']=ifelse(leveneTest(as.numeric(df2[,mystr])~df2[,group])[1,3]>0.05,'是','否')
#anova p-value
anova_p <- summary(aov(as.numeric(df2[,mystr])~df2[,group]))[[1]]["Pr(>F)"][1,1]
df_result[i+1,'anova p-value']<-ifelse(anova_p<0.000001,formatC(anova_p,format='e',digits=2),formatC(anova_p,digits = 6, format = "f"))
#kruskal-wallis p-value
kw_p <- kruskal.test(as.numeric(df2[,mystr])~df2[,group])$p.value
df_result[i+1,'kruskal-wallis p-value']<-ifelse(kw_p<0.000001,formatC(kw_p,format='e',digits=2),formatC(kw_p,digits = 6, format = "f"))
#t-test p-value
if(t_test==TRUE)
{
variance_equal <- ifelse(df_result[i+1,'是否方差齐性']=='是',TRUE,FALSE) #根据方差是否齐性选择t检验函数参数
t_p <- t.test(as.numeric(df2[,mystr])~df2[,group],var.equal=variance_equal)$p.value
df_result[i+1,'t检验 p-value']<-ifelse(t_p<0.000001,formatC(t_p,format='e',digits=2),formatC(t_p,digits = 6, format = "f"))
}
}else
{
#计算freq和占比
t<-as.data.frame(table(df2[,mystr]))
df_result[i+1,'Overall']=paste(as.character(t[which(t[,1]==1),'Freq']),' (',as.character(formatC(t[which(t[,1]==1),'Freq']/dim(df)[1]*100,digits = 2, format = "f")),'%)',sep = "")
t<-as.data.frame(table(df2[,c(mystr,group)]))
for(j in 1:n_group)
{
df_result[i+1,group_list[j]]=paste(as.character(t[which(t[,1]=='1' & t[,2]==group_list[j]),'Freq']),' (',
as.character(formatC(t[which(t[,1]=='1' & t[,2]==group_list[j]),'Freq']/each_group_num[which(each_group_num[,1]==group_list[j]),2]*100,digits = 2, format = "f")),
'%)',sep='')
}
#卡方检验 p-value
chisq_p <- chisq.test(as.matrix(table(df2[,c(mystr,group)])))$p.value
df_result[i+1,'卡方检验 p-value']<-ifelse(chisq_p<0.000001,formatC(chisq_p,format='e',digits=2),formatC(chisq_p,digits =6, format = "f"))
}
}
print("0.00e+00代表该数小于1e-323")
return(df_result)
}
dev_yinshi <- dbGetQuery(con,"select * from dev_yinshi")
data_v1_new<-dbGetQuery(con,"select * from data_v1_new")
dev_yinshi<-dev_yinshi[which(!(is.na(dev_yinshi$id_980_v1) &
is.na(dev_yinshi$id_982_v1) &
is.na(dev_yinshi$id_981_v1) &
is.na(dev_yinshi$id_983_v1) &
is.na(dev_yinshi$id_984_v1) )),]
dev_yinshi$group1<-NA
dev_yinshi$group1[dev_yinshi$id_980_v1==0]<-0
dev_yinshi$group1[dev_yinshi$id_980_v1==1]<-1
dev_yinshi$group2<-NA
dev_yinshi$group2[dev_yinshi$id_982_v1==98202]<-1
dev_yinshi$group2[dev_yinshi$id_982_v1==98201 |
dev_yinshi$id_982_v1==98203 ]<-0
dev_yinshi$group3<-NA
dev_yinshi$group3[dev_yinshi$id_981_v1==98101 |
dev_yinshi$id_981_v1==98102 ]<-0
dev_yinshi$group3[dev_yinshi$id_981_v1==98103 |
dev_yinshi$id_981_v1==98104 |
dev_yinshi$id_981_v1==98105 ]<-1
dev_yinshi$group4<-NA
dev_yinshi$group4[dev_yinshi$id_983_v1==98301]<-1
dev_yinshi$group4[dev_yinshi$id_983_v1==98302 |
dev_yinshi$id_983_v1==98303 |
dev_yinshi$id_983_v1==98304 ]<-0
dev_yinshi$group5<-NA
dev_yinshi$group5[dev_yinshi$id_984_v1==98401 |
dev_yinshi$id_984_v1==98402 |
dev_yinshi$id_984_v1==98403 ]<-1
dev_yinshi$group5[dev_yinshi$id_984_v1==98404]<-0
dev_yinshi$score<-ifelse( ifelse(is.na(dev_yinshi$group1),0,dev_yinshi$group1)==1,1,0)+
ifelse( ifelse(is.na(dev_yinshi$group2),0,dev_yinshi$group2)==1,1,0)+
ifelse( ifelse(is.na(dev_yinshi$group3),0,dev_yinshi$group3)==1,1,0)+
ifelse( ifelse(is.na(dev_yinshi$group4),0,dev_yinshi$group4)==1,1,0)+
ifelse( ifelse(is.na(dev_yinshi$group5),0,dev_yinshi$group5)==1,1,0)
dev_yinshi$score[is.na(dev_yinshi$group1) & is.na(dev_yinshi$group2) & is.na(dev_yinshi$group3) &
is.na(dev_yinshi$group4) & is.na(dev_yinshi$group5) ]<-NA
dev_yinshi$score_2<-ifelse( ifelse(is.na(dev_yinshi$group1),0,dev_yinshi$group1)==1,1,0)+
ifelse( ifelse(is.na(dev_yinshi$group2),0,dev_yinshi$group2)==1,1,0)+
ifelse( ifelse(is.na(dev_yinshi$group3),0,dev_yinshi$group3)==1,1,0)+
ifelse( ifelse(is.na(dev_yinshi$group4),0,dev_yinshi$group4)==1,1,0)+
ifelse( ifelse(is.na(dev_yinshi$group5),0,dev_yinshi$group5)==1,1,0)
dev_yinshi$score_2[is.na(dev_yinshi$group1) | is.na(dev_yinshi$group2) | is.na(dev_yinshi$group3) |
is.na(dev_yinshi$group4) | is.na(dev_yinshi$group5) ]<-NA
dev_yinshi<-dev_yinshi[which(!is.na(dev_yinshi$score_2)),]
###id_979
data_1<-merge(dev_yinshi,data_v1_new,by='user_id')
data_1$db_duration_year<-as.numeric(data_1$diabetic_duration)/12
data_1$ideal_smoking<-NA
data_1$ideal_smoking[which(data_1$smoke_info==3)]<-as.character(1)
data_1$ideal_smoking[which(data_1$smoke_info==1 | data_1$smoke_info==2)]<-as.character(0)
data_1$current_drinker<-NA
data_1$current_drinker[which(data_1$drink_info==3)]<-as.character(1)
data_1$current_drinker[which(data_1$drink_info==1 | data_1$drink_info==2)]<-as.character(0)
data_1$avg_pwv <- NA
data_1$val_929003[which(data_1$flag_929003!=0)]<-NA
data_1$val_929004[which(data_1$flag_929004!=0)]<-NA
for (i in 1:nrow(data_1)) {
if(is.na(data_1$val_929003[i]) & !is.na(data_1$val_929004[i])) {
data_1$avg_pwv[i]<-as.numeric(data_1$val_929004[i])
}
if(!is.na(data_1$val_929003[i]) & is.na(data_1$val_929004[i])) {
data_1$avg_pwv[i]<-as.numeric(data_1$val_929003[i])
}
if (!is.na(data_1$val_929003[i]) & !is.na(data_1$val_929004[i])) {
data_1$avg_pwv[i]<-(as.numeric(data_1$val_929004[i])+as.numeric(data_1$val_929003[i]))*0.5
}
}
data_1$ABI <- NA
data_1$val_929001[which(data_1$flag_929001!=0)]<-NA
data_1$val_929002[which(data_1$flag_929002!=0)]<-NA
for (i in 1:nrow(data_1)) {
if(is.na(data_1$val_929001[i]) & !is.na(data_1$val_929002[i])) {
data_1$ABI[i]<-as.numeric(data_1$val_929002[i])
}
if(!is.na(data_1$val_929001[i]) & is.na(data_1$val_929002[i])) {
data_1$ABI[i]<-as.numeric(data_1$val_929001[i])
}
if (!is.na(data_1$val_929001[i]) & !is.na(data_1$val_929002[i])) {
data_1$ABI[i]<-(as.numeric(data_1$val_929002[i])+as.numeric(data_1$val_929001[i]))*0.5
}
}
data_1$IMT <- NA
data_1$val_925001[which(data_1$flag_925001!=0)]<-NA
data_1$val_925009[which(data_1$flag_925009!=0)]<-NA
for (i in 1:nrow(data_1)) {
if(is.na(data_1$val_925001[i]) & !is.na(data_1$val_925009[i])) {
data_1$IMT[i]<-as.numeric(data_1$val_925009[i])
}
if(!is.na(data_1$val_925001[i]) & is.na(data_1$val_925009[i])) {
data_1$IMT[i]<-as.numeric(data_1$val_925001[i])
}
if (!is.na(data_1$val_925001[i]) & !is.na(data_1$val_925009[i])) {
data_1$IMT[i]<-(as.numeric(data_1$val_925009[i])+as.numeric(data_1$val_925001[i]))*0.5
}
}
data_1$DR<-NA
data_1[which(data_1$manual_read_res==0),'DR']<-0
data_1[which(data_1$manual_read_res>=1),'DR']<-1
group<-'id_979'
variable_list<-c('age','sex','education_level',
'diabetes_history','id_477','id_478','db_duration_year',
'ideal_smoking','current_drinker','bmi','weight','vat',
'waistline','hips','whr','sbp','dbp',
'val_875001','val_875011','homa2','val_875004','val_875014',
'val_876','val_878040','val_878041','val_878042','val_901013',
'avg_pwv','ABI','IMT','DR','group2',
'group3','group5','group4')
type_list<-c('连续','离散','离散',
'离散','离散','离散','连续',
'离散','离散','连续','连续','连续',
'连续','连续','连续','连续','连续',
'连续','连续','连续','连续','连续',
'连续','连续','连续','连续','连续',
'连续','连续','连续','离散','离散',
'离散','离散','离散')
df_result<-analysis(data_1,variable_list,type_list,group,group_list=0,t_test=FALSE)
df_result$p<-ifelse(!is.na(df_result$`anova p-value`),df_result$`anova p-value`,df_result$`卡方检验 p-value`)
tem1<-c('Variables','Overall','1','2','3','4','p')
df_result_2<-df_result[,tem1]