analysis<-function(df,variable_list,type_list,group,group_list=0,t_test=FALSE){ library(carData) library(car) library(nortest) df <- df[which(!is.na(df[,group])),] #筛选掉没有分组的观测 if(group_list == 0) group_list<-as.character(as.data.frame(table(df[,group]))[,1]) #group的类名 n_group<-length(unique(na.omit(df[,group]))) #group类数 #生成分析结果的dataframe df_result<-data.frame(Variables=c('N',variable_list),Overall=NA) each_group_num<-as.data.frame(table(df[,group])) df_result[1,'Overall']<-count(df) for(i in 1:n_group) #添加group的列 { # print(group_list[i]) df_result[,group_list[i]]=NA df_result[1,group_list[i]]=each_group_num[which(each_group_num[,1]==group_list[i]),2] } #添加总的四分位数列 df_result[,'Median']=NA for(i in 1:n_group) #添加group的四分位数列 { df_result[,paste(group_list[i],'Median')]=NA } df_result[,'是否正态']<-NA df_result[,'是否方差齐性']<-NA df_result[,'anova p-value']<-NA df_result[,'kruskal-wallis p-value']<-NA df_result[,'卡方检验 p-value']<-NA if(t_test==TRUE) df_result[,'t检验 p-value']<-NA df_result[,'wilcox秩和检验 p-value']<-NA #遍历字段 for(i in seq(1,dim(df_result)[1]-1)) { mystr=variable_list[i] #字段名 df2<-df df2<-df2[which(!is.na(df2[,group])),] #去group字段na的数据 #查找是否有对应的flag字段,若有,筛选出flag=0的数据 mystr2=ifelse(substring(mystr,1,3)=='val',paste('flag_',substring(mystr,5,nchar(mystr)),sep=""),paste('flag_',mystr,sep="")) if(mystr2 %in% colnames(df)) df2<-df2[which(df2[,mystr2]==0),] #连续和分类变量分别分析 if(type_list[i]=='连续') { df2<-df2[which(!is.na(df2[,mystr])),] #去自变量na的数据 #计算均值标准差,无法转成数值型的去掉 df_result[i+1,'Overall']=paste(as.character(formatC(mean(as.numeric(df2[,mystr]),na.rm = T),digits = 2, format = "f")),' ± ',as.character(formatC(sd(as.numeric(df2[,mystr]),na.rm = T),digits = 2, format = "f")), sep = "") for(j in 1:n_group) { df_result[i+1,group_list[j]]=paste(as.character(formatC(mean(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T),digits = 2, format = "f")),' ± ',as.character(formatC(sd(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T),digits = 2, format = "f")),sep = "") df_result[i+1,'Median']=paste(as.character(formatC(median(as.numeric(df2[,mystr]),na.rm = T),digits = 2, format = "f")),' (',as.character(formatC(quantile(as.numeric(df2[,mystr]),na.rm = T)[2],digits = 2, format = "f")),', ',as.character(formatC(quantile(as.numeric(df2[,mystr]),na.rm = T)[4],digits = 2, format = "f")),') ',sep = "") df_result[i+1,paste(group_list[j],'Median')]=paste(as.character(formatC(median(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T),digits = 2, format = "f")),' (',as.character(formatC(quantile(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T)[2],digits = 2, format = "f")),', ',as.character(formatC(quantile(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T)[4],digits = 2, format = "f")),') ',sep = "") } #是否正态,样本量>5000时不能用shapiro 检验 df_result[i+1,'是否正态']=ifelse(ad.test(as.numeric(df2[,mystr]))$p.value>0.05,'是','否') #是否方差齐性 df_result[i+1,'是否方差齐性']=ifelse(leveneTest(as.numeric(df2[,mystr])~df2[,group])[1,3]>0.05,'是','否') #anova p-value anova_p <- summary(aov(as.numeric(df2[,mystr])~df2[,group]))[[1]]["Pr(>F)"][1,1] df_result[i+1,'anova p-value']<-ifelse(anova_p<0.000001,formatC(anova_p,format='e',digits=2),formatC(anova_p,digits = 6, format = "f")) #kruskal-wallis p-value kw_p <- kruskal.test(as.numeric(df2[,mystr])~df2[,group])$p.value df_result[i+1,'kruskal-wallis p-value']<-ifelse(kw_p<0.000001,formatC(kw_p,format='e',digits=2),formatC(kw_p,digits = 6, format = "f")) #t-test p-value if(t_test==TRUE) { variance_equal <- ifelse(df_result[i+1,'是否方差齐性']=='是',TRUE,FALSE) #根据方差是否齐性选择t检验函数参数 t_p <- t.test(as.numeric(df2[,mystr])~df2[,group],var.equal=variance_equal)$p.value df_result[i+1,'t检验 p-value']<-ifelse(t_p<0.000001,formatC(t_p,format='e',digits=2),formatC(t_p,digits = 6, format = "f")) } }else { #计算freq和占比 t<-as.data.frame(table(df2[,mystr])) df_result[i+1,'Overall']=paste(as.character(t[which(t[,1]==1),'Freq']),' (',as.character(formatC(t[which(t[,1]==1),'Freq']/dim(df)[1]*100,digits = 2, format = "f")),'%)',sep = "") t<-as.data.frame(table(df2[,c(mystr,group)])) for(j in 1:n_group) { df_result[i+1,group_list[j]]=paste(as.character(t[which(t[,1]=='1' & t[,2]==group_list[j]),'Freq']),' (', as.character(formatC(t[which(t[,1]=='1' & t[,2]==group_list[j]),'Freq']/each_group_num[which(each_group_num[,1]==group_list[j]),2]*100,digits = 2, format = "f")), '%)',sep='') } #卡方检验 p-value chisq_p <- chisq.test(as.matrix(table(df2[,c(mystr,group)])))$p.value df_result[i+1,'卡方检验 p-value']<-ifelse(chisq_p<0.000001,formatC(chisq_p,format='e',digits=2),formatC(chisq_p,digits =6, format = "f")) } } print("0.00e+00代表该数小于1e-323") return(df_result)}dev_yinshi <- dbGetQuery(con,"select * from dev_yinshi")data_v1_new<-dbGetQuery(con,"select * from data_v1_new")dev_yinshi<-dev_yinshi[which(!(is.na(dev_yinshi$id_980_v1) & is.na(dev_yinshi$id_982_v1) & is.na(dev_yinshi$id_981_v1) & is.na(dev_yinshi$id_983_v1) & is.na(dev_yinshi$id_984_v1) )),]dev_yinshi$group1<-NAdev_yinshi$group1[dev_yinshi$id_980_v1==0]<-0dev_yinshi$group1[dev_yinshi$id_980_v1==1]<-1dev_yinshi$group2<-NAdev_yinshi$group2[dev_yinshi$id_982_v1==98202]<-1dev_yinshi$group2[dev_yinshi$id_982_v1==98201 | dev_yinshi$id_982_v1==98203 ]<-0dev_yinshi$group3<-NAdev_yinshi$group3[dev_yinshi$id_981_v1==98101 | dev_yinshi$id_981_v1==98102 ]<-0dev_yinshi$group3[dev_yinshi$id_981_v1==98103 | dev_yinshi$id_981_v1==98104 | dev_yinshi$id_981_v1==98105 ]<-1dev_yinshi$group4<-NAdev_yinshi$group4[dev_yinshi$id_983_v1==98301]<-1dev_yinshi$group4[dev_yinshi$id_983_v1==98302 | dev_yinshi$id_983_v1==98303 | dev_yinshi$id_983_v1==98304 ]<-0dev_yinshi$group5<-NAdev_yinshi$group5[dev_yinshi$id_984_v1==98401 | dev_yinshi$id_984_v1==98402 | dev_yinshi$id_984_v1==98403 ]<-1dev_yinshi$group5[dev_yinshi$id_984_v1==98404]<-0dev_yinshi$score<-ifelse( ifelse(is.na(dev_yinshi$group1),0,dev_yinshi$group1)==1,1,0)+ ifelse( ifelse(is.na(dev_yinshi$group2),0,dev_yinshi$group2)==1,1,0)+ ifelse( ifelse(is.na(dev_yinshi$group3),0,dev_yinshi$group3)==1,1,0)+ ifelse( ifelse(is.na(dev_yinshi$group4),0,dev_yinshi$group4)==1,1,0)+ ifelse( ifelse(is.na(dev_yinshi$group5),0,dev_yinshi$group5)==1,1,0)dev_yinshi$score[is.na(dev_yinshi$group1) & is.na(dev_yinshi$group2) & is.na(dev_yinshi$group3) & is.na(dev_yinshi$group4) & is.na(dev_yinshi$group5) ]<-NAdev_yinshi$score_2<-ifelse( ifelse(is.na(dev_yinshi$group1),0,dev_yinshi$group1)==1,1,0)+ ifelse( ifelse(is.na(dev_yinshi$group2),0,dev_yinshi$group2)==1,1,0)+ ifelse( ifelse(is.na(dev_yinshi$group3),0,dev_yinshi$group3)==1,1,0)+ ifelse( ifelse(is.na(dev_yinshi$group4),0,dev_yinshi$group4)==1,1,0)+ ifelse( ifelse(is.na(dev_yinshi$group5),0,dev_yinshi$group5)==1,1,0)dev_yinshi$score_2[is.na(dev_yinshi$group1) | is.na(dev_yinshi$group2) | is.na(dev_yinshi$group3) | is.na(dev_yinshi$group4) | is.na(dev_yinshi$group5) ]<-NAdev_yinshi<-dev_yinshi[which(!is.na(dev_yinshi$score_2)),]###id_979data_1<-merge(dev_yinshi,data_v1_new,by='user_id')data_1$db_duration_year<-as.numeric(data_1$diabetic_duration)/12data_1$ideal_smoking<-NAdata_1$ideal_smoking[which(data_1$smoke_info==3)]<-as.character(1)data_1$ideal_smoking[which(data_1$smoke_info==1 | data_1$smoke_info==2)]<-as.character(0)data_1$current_drinker<-NAdata_1$current_drinker[which(data_1$drink_info==3)]<-as.character(1)data_1$current_drinker[which(data_1$drink_info==1 | data_1$drink_info==2)]<-as.character(0)data_1$avg_pwv <- NAdata_1$val_929003[which(data_1$flag_929003!=0)]<-NAdata_1$val_929004[which(data_1$flag_929004!=0)]<-NAfor (i in 1:nrow(data_1)) { if(is.na(data_1$val_929003[i]) & !is.na(data_1$val_929004[i])) { data_1$avg_pwv[i]<-as.numeric(data_1$val_929004[i]) } if(!is.na(data_1$val_929003[i]) & is.na(data_1$val_929004[i])) { data_1$avg_pwv[i]<-as.numeric(data_1$val_929003[i]) } if (!is.na(data_1$val_929003[i]) & !is.na(data_1$val_929004[i])) { data_1$avg_pwv[i]<-(as.numeric(data_1$val_929004[i])+as.numeric(data_1$val_929003[i]))*0.5 }}data_1$ABI <- NAdata_1$val_929001[which(data_1$flag_929001!=0)]<-NAdata_1$val_929002[which(data_1$flag_929002!=0)]<-NAfor (i in 1:nrow(data_1)) { if(is.na(data_1$val_929001[i]) & !is.na(data_1$val_929002[i])) { data_1$ABI[i]<-as.numeric(data_1$val_929002[i]) } if(!is.na(data_1$val_929001[i]) & is.na(data_1$val_929002[i])) { data_1$ABI[i]<-as.numeric(data_1$val_929001[i]) } if (!is.na(data_1$val_929001[i]) & !is.na(data_1$val_929002[i])) { data_1$ABI[i]<-(as.numeric(data_1$val_929002[i])+as.numeric(data_1$val_929001[i]))*0.5 }}data_1$IMT <- NAdata_1$val_925001[which(data_1$flag_925001!=0)]<-NAdata_1$val_925009[which(data_1$flag_925009!=0)]<-NAfor (i in 1:nrow(data_1)) { if(is.na(data_1$val_925001[i]) & !is.na(data_1$val_925009[i])) { data_1$IMT[i]<-as.numeric(data_1$val_925009[i]) } if(!is.na(data_1$val_925001[i]) & is.na(data_1$val_925009[i])) { data_1$IMT[i]<-as.numeric(data_1$val_925001[i]) } if (!is.na(data_1$val_925001[i]) & !is.na(data_1$val_925009[i])) { data_1$IMT[i]<-(as.numeric(data_1$val_925009[i])+as.numeric(data_1$val_925001[i]))*0.5 }}data_1$DR<-NAdata_1[which(data_1$manual_read_res==0),'DR']<-0data_1[which(data_1$manual_read_res>=1),'DR']<-1group<-'id_979'variable_list<-c('age','sex','education_level', 'diabetes_history','id_477','id_478','db_duration_year', 'ideal_smoking','current_drinker','bmi','weight','vat', 'waistline','hips','whr','sbp','dbp', 'val_875001','val_875011','homa2','val_875004','val_875014', 'val_876','val_878040','val_878041','val_878042','val_901013', 'avg_pwv','ABI','IMT','DR','group2', 'group3','group5','group4')type_list<-c('连续','离散','离散', '离散','离散','离散','连续', '离散','离散','连续','连续','连续', '连续','连续','连续','连续','连续', '连续','连续','连续','连续','连续', '连续','连续','连续','连续','连续', '连续','连续','连续','离散','离散', '离散','离散','离散')df_result<-analysis(data_1,variable_list,type_list,group,group_list=0,t_test=FALSE)df_result$p<-ifelse(!is.na(df_result$`anova p-value`),df_result$`anova p-value`,df_result$`卡方检验 p-value`)tem1<-c('Variables','Overall','1','2','3','4','p')df_result_2<-df_result[,tem1]