1. analysis<-function(df,variable_list,type_list,group,group_list=0,t_test=FALSE)
    2. {
    3. library(carData)
    4. library(car)
    5. library(nortest)
    6. df <- df[which(!is.na(df[,group])),] #筛选掉没有分组的观测
    7. if(group_list == 0)
    8. group_list<-as.character(as.data.frame(table(df[,group]))[,1]) #group的类名
    9. n_group<-length(unique(na.omit(df[,group]))) #group类数
    10. #生成分析结果的dataframe
    11. df_result<-data.frame(Variables=c('N',variable_list),Overall=NA)
    12. each_group_num<-as.data.frame(table(df[,group]))
    13. df_result[1,'Overall']<-count(df)
    14. for(i in 1:n_group) #添加group的列
    15. {
    16. # print(group_list[i])
    17. df_result[,group_list[i]]=NA
    18. df_result[1,group_list[i]]=each_group_num[which(each_group_num[,1]==group_list[i]),2]
    19. }
    20. #添加总的四分位数列
    21. df_result[,'Median']=NA
    22. for(i in 1:n_group) #添加group的四分位数列
    23. {
    24. df_result[,paste(group_list[i],'Median')]=NA
    25. }
    26. df_result[,'是否正态']<-NA
    27. df_result[,'是否方差齐性']<-NA
    28. df_result[,'anova p-value']<-NA
    29. df_result[,'kruskal-wallis p-value']<-NA
    30. df_result[,'卡方检验 p-value']<-NA
    31. if(t_test==TRUE)
    32. df_result[,'t检验 p-value']<-NA
    33. df_result[,'wilcox秩和检验 p-value']<-NA
    34. #遍历字段
    35. for(i in seq(1,dim(df_result)[1]-1))
    36. {
    37. mystr=variable_list[i] #字段名
    38. df2<-df
    39. df2<-df2[which(!is.na(df2[,group])),] #去group字段na的数据
    40. #查找是否有对应的flag字段,若有,筛选出flag=0的数据
    41. mystr2=ifelse(substring(mystr,1,3)=='val',paste('flag_',substring(mystr,5,nchar(mystr)),sep=""),paste('flag_',mystr,sep=""))
    42. if(mystr2 %in% colnames(df))
    43. df2<-df2[which(df2[,mystr2]==0),]
    44. #连续和分类变量分别分析
    45. if(type_list[i]=='连续')
    46. {
    47. df2<-df2[which(!is.na(df2[,mystr])),] #去自变量na的数据
    48. #计算均值标准差,无法转成数值型的去掉
    49. df_result[i+1,'Overall']=paste(as.character(formatC(mean(as.numeric(df2[,mystr]),na.rm = T),digits = 2, format = "f")),' ± ',as.character(formatC(sd(as.numeric(df2[,mystr]),na.rm = T),digits = 2, format = "f")), sep = "")
    50. for(j in 1:n_group)
    51. {
    52. df_result[i+1,group_list[j]]=paste(as.character(formatC(mean(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T),digits = 2, format = "f")),' ± ',as.character(formatC(sd(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T),digits = 2, format = "f")),sep = "")
    53. df_result[i+1,'Median']=paste(as.character(formatC(median(as.numeric(df2[,mystr]),na.rm = T),digits = 2, format = "f")),' (',as.character(formatC(quantile(as.numeric(df2[,mystr]),na.rm = T)[2],digits = 2, format = "f")),', ',as.character(formatC(quantile(as.numeric(df2[,mystr]),na.rm = T)[4],digits = 2, format = "f")),') ',sep = "")
    54. df_result[i+1,paste(group_list[j],'Median')]=paste(as.character(formatC(median(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T),digits = 2, format = "f")),' (',as.character(formatC(quantile(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T)[2],digits = 2, format = "f")),', ',as.character(formatC(quantile(as.numeric(df2[which(df2[,group]==group_list[j]),mystr]),na.rm = T)[4],digits = 2, format = "f")),') ',sep = "")
    55. }
    56. #是否正态,样本量>5000时不能用shapiro 检验
    57. df_result[i+1,'是否正态']=ifelse(ad.test(as.numeric(df2[,mystr]))$p.value>0.05,'是','否')
    58. #是否方差齐性
    59. df_result[i+1,'是否方差齐性']=ifelse(leveneTest(as.numeric(df2[,mystr])~df2[,group])[1,3]>0.05,'是','否')
    60. #anova p-value
    61. anova_p <- summary(aov(as.numeric(df2[,mystr])~df2[,group]))[[1]]["Pr(>F)"][1,1]
    62. df_result[i+1,'anova p-value']<-ifelse(anova_p<0.000001,formatC(anova_p,format='e',digits=2),formatC(anova_p,digits = 6, format = "f"))
    63. #kruskal-wallis p-value
    64. kw_p <- kruskal.test(as.numeric(df2[,mystr])~df2[,group])$p.value
    65. df_result[i+1,'kruskal-wallis p-value']<-ifelse(kw_p<0.000001,formatC(kw_p,format='e',digits=2),formatC(kw_p,digits = 6, format = "f"))
    66. #t-test p-value
    67. if(t_test==TRUE)
    68. {
    69. variance_equal <- ifelse(df_result[i+1,'是否方差齐性']=='是',TRUE,FALSE) #根据方差是否齐性选择t检验函数参数
    70. t_p <- t.test(as.numeric(df2[,mystr])~df2[,group],var.equal=variance_equal)$p.value
    71. df_result[i+1,'t检验 p-value']<-ifelse(t_p<0.000001,formatC(t_p,format='e',digits=2),formatC(t_p,digits = 6, format = "f"))
    72. }
    73. }else
    74. {
    75. #计算freq和占比
    76. t<-as.data.frame(table(df2[,mystr]))
    77. df_result[i+1,'Overall']=paste(as.character(t[which(t[,1]==1),'Freq']),' (',as.character(formatC(t[which(t[,1]==1),'Freq']/dim(df)[1]*100,digits = 2, format = "f")),'%)',sep = "")
    78. t<-as.data.frame(table(df2[,c(mystr,group)]))
    79. for(j in 1:n_group)
    80. {
    81. df_result[i+1,group_list[j]]=paste(as.character(t[which(t[,1]=='1' & t[,2]==group_list[j]),'Freq']),' (',
    82. as.character(formatC(t[which(t[,1]=='1' & t[,2]==group_list[j]),'Freq']/each_group_num[which(each_group_num[,1]==group_list[j]),2]*100,digits = 2, format = "f")),
    83. '%)',sep='')
    84. }
    85. #卡方检验 p-value
    86. chisq_p <- chisq.test(as.matrix(table(df2[,c(mystr,group)])))$p.value
    87. df_result[i+1,'卡方检验 p-value']<-ifelse(chisq_p<0.000001,formatC(chisq_p,format='e',digits=2),formatC(chisq_p,digits =6, format = "f"))
    88. }
    89. }
    90. print("0.00e+00代表该数小于1e-323")
    91. return(df_result)
    92. }
    93. dev_yinshi <- dbGetQuery(con,"select * from dev_yinshi")
    94. data_v1_new<-dbGetQuery(con,"select * from data_v1_new")
    95. dev_yinshi<-dev_yinshi[which(!(is.na(dev_yinshi$id_980_v1) &
    96. is.na(dev_yinshi$id_982_v1) &
    97. is.na(dev_yinshi$id_981_v1) &
    98. is.na(dev_yinshi$id_983_v1) &
    99. is.na(dev_yinshi$id_984_v1) )),]
    100. dev_yinshi$group1<-NA
    101. dev_yinshi$group1[dev_yinshi$id_980_v1==0]<-0
    102. dev_yinshi$group1[dev_yinshi$id_980_v1==1]<-1
    103. dev_yinshi$group2<-NA
    104. dev_yinshi$group2[dev_yinshi$id_982_v1==98202]<-1
    105. dev_yinshi$group2[dev_yinshi$id_982_v1==98201 |
    106. dev_yinshi$id_982_v1==98203 ]<-0
    107. dev_yinshi$group3<-NA
    108. dev_yinshi$group3[dev_yinshi$id_981_v1==98101 |
    109. dev_yinshi$id_981_v1==98102 ]<-0
    110. dev_yinshi$group3[dev_yinshi$id_981_v1==98103 |
    111. dev_yinshi$id_981_v1==98104 |
    112. dev_yinshi$id_981_v1==98105 ]<-1
    113. dev_yinshi$group4<-NA
    114. dev_yinshi$group4[dev_yinshi$id_983_v1==98301]<-1
    115. dev_yinshi$group4[dev_yinshi$id_983_v1==98302 |
    116. dev_yinshi$id_983_v1==98303 |
    117. dev_yinshi$id_983_v1==98304 ]<-0
    118. dev_yinshi$group5<-NA
    119. dev_yinshi$group5[dev_yinshi$id_984_v1==98401 |
    120. dev_yinshi$id_984_v1==98402 |
    121. dev_yinshi$id_984_v1==98403 ]<-1
    122. dev_yinshi$group5[dev_yinshi$id_984_v1==98404]<-0
    123. dev_yinshi$score<-ifelse( ifelse(is.na(dev_yinshi$group1),0,dev_yinshi$group1)==1,1,0)+
    124. ifelse( ifelse(is.na(dev_yinshi$group2),0,dev_yinshi$group2)==1,1,0)+
    125. ifelse( ifelse(is.na(dev_yinshi$group3),0,dev_yinshi$group3)==1,1,0)+
    126. ifelse( ifelse(is.na(dev_yinshi$group4),0,dev_yinshi$group4)==1,1,0)+
    127. ifelse( ifelse(is.na(dev_yinshi$group5),0,dev_yinshi$group5)==1,1,0)
    128. dev_yinshi$score[is.na(dev_yinshi$group1) & is.na(dev_yinshi$group2) & is.na(dev_yinshi$group3) &
    129. is.na(dev_yinshi$group4) & is.na(dev_yinshi$group5) ]<-NA
    130. dev_yinshi$score_2<-ifelse( ifelse(is.na(dev_yinshi$group1),0,dev_yinshi$group1)==1,1,0)+
    131. ifelse( ifelse(is.na(dev_yinshi$group2),0,dev_yinshi$group2)==1,1,0)+
    132. ifelse( ifelse(is.na(dev_yinshi$group3),0,dev_yinshi$group3)==1,1,0)+
    133. ifelse( ifelse(is.na(dev_yinshi$group4),0,dev_yinshi$group4)==1,1,0)+
    134. ifelse( ifelse(is.na(dev_yinshi$group5),0,dev_yinshi$group5)==1,1,0)
    135. dev_yinshi$score_2[is.na(dev_yinshi$group1) | is.na(dev_yinshi$group2) | is.na(dev_yinshi$group3) |
    136. is.na(dev_yinshi$group4) | is.na(dev_yinshi$group5) ]<-NA
    137. dev_yinshi<-dev_yinshi[which(!is.na(dev_yinshi$score_2)),]
    138. ###id_979
    139. data_1<-merge(dev_yinshi,data_v1_new,by='user_id')
    140. data_1$db_duration_year<-as.numeric(data_1$diabetic_duration)/12
    141. data_1$ideal_smoking<-NA
    142. data_1$ideal_smoking[which(data_1$smoke_info==3)]<-as.character(1)
    143. data_1$ideal_smoking[which(data_1$smoke_info==1 | data_1$smoke_info==2)]<-as.character(0)
    144. data_1$current_drinker<-NA
    145. data_1$current_drinker[which(data_1$drink_info==3)]<-as.character(1)
    146. data_1$current_drinker[which(data_1$drink_info==1 | data_1$drink_info==2)]<-as.character(0)
    147. data_1$avg_pwv <- NA
    148. data_1$val_929003[which(data_1$flag_929003!=0)]<-NA
    149. data_1$val_929004[which(data_1$flag_929004!=0)]<-NA
    150. for (i in 1:nrow(data_1)) {
    151. if(is.na(data_1$val_929003[i]) & !is.na(data_1$val_929004[i])) {
    152. data_1$avg_pwv[i]<-as.numeric(data_1$val_929004[i])
    153. }
    154. if(!is.na(data_1$val_929003[i]) & is.na(data_1$val_929004[i])) {
    155. data_1$avg_pwv[i]<-as.numeric(data_1$val_929003[i])
    156. }
    157. if (!is.na(data_1$val_929003[i]) & !is.na(data_1$val_929004[i])) {
    158. data_1$avg_pwv[i]<-(as.numeric(data_1$val_929004[i])+as.numeric(data_1$val_929003[i]))*0.5
    159. }
    160. }
    161. data_1$ABI <- NA
    162. data_1$val_929001[which(data_1$flag_929001!=0)]<-NA
    163. data_1$val_929002[which(data_1$flag_929002!=0)]<-NA
    164. for (i in 1:nrow(data_1)) {
    165. if(is.na(data_1$val_929001[i]) & !is.na(data_1$val_929002[i])) {
    166. data_1$ABI[i]<-as.numeric(data_1$val_929002[i])
    167. }
    168. if(!is.na(data_1$val_929001[i]) & is.na(data_1$val_929002[i])) {
    169. data_1$ABI[i]<-as.numeric(data_1$val_929001[i])
    170. }
    171. if (!is.na(data_1$val_929001[i]) & !is.na(data_1$val_929002[i])) {
    172. data_1$ABI[i]<-(as.numeric(data_1$val_929002[i])+as.numeric(data_1$val_929001[i]))*0.5
    173. }
    174. }
    175. data_1$IMT <- NA
    176. data_1$val_925001[which(data_1$flag_925001!=0)]<-NA
    177. data_1$val_925009[which(data_1$flag_925009!=0)]<-NA
    178. for (i in 1:nrow(data_1)) {
    179. if(is.na(data_1$val_925001[i]) & !is.na(data_1$val_925009[i])) {
    180. data_1$IMT[i]<-as.numeric(data_1$val_925009[i])
    181. }
    182. if(!is.na(data_1$val_925001[i]) & is.na(data_1$val_925009[i])) {
    183. data_1$IMT[i]<-as.numeric(data_1$val_925001[i])
    184. }
    185. if (!is.na(data_1$val_925001[i]) & !is.na(data_1$val_925009[i])) {
    186. data_1$IMT[i]<-(as.numeric(data_1$val_925009[i])+as.numeric(data_1$val_925001[i]))*0.5
    187. }
    188. }
    189. data_1$DR<-NA
    190. data_1[which(data_1$manual_read_res==0),'DR']<-0
    191. data_1[which(data_1$manual_read_res>=1),'DR']<-1
    192. group<-'id_979'
    193. variable_list<-c('age','sex','education_level',
    194. 'diabetes_history','id_477','id_478','db_duration_year',
    195. 'ideal_smoking','current_drinker','bmi','weight','vat',
    196. 'waistline','hips','whr','sbp','dbp',
    197. 'val_875001','val_875011','homa2','val_875004','val_875014',
    198. 'val_876','val_878040','val_878041','val_878042','val_901013',
    199. 'avg_pwv','ABI','IMT','DR','group2',
    200. 'group3','group5','group4')
    201. type_list<-c('连续','离散','离散',
    202. '离散','离散','离散','连续',
    203. '离散','离散','连续','连续','连续',
    204. '连续','连续','连续','连续','连续',
    205. '连续','连续','连续','连续','连续',
    206. '连续','连续','连续','连续','连续',
    207. '连续','连续','连续','离散','离散',
    208. '离散','离散','离散')
    209. df_result<-analysis(data_1,variable_list,type_list,group,group_list=0,t_test=FALSE)
    210. df_result$p<-ifelse(!is.na(df_result$`anova p-value`),df_result$`anova p-value`,df_result$`卡方检验 p-value`)
    211. tem1<-c('Variables','Overall','1','2','3','4','p')
    212. df_result_2<-df_result[,tem1]