event需要是0、1

生存分析前的数据整理

表达矩阵只需要tumor数据，不要normal，将其去掉，新表达矩阵数据命名为exprSet；
临床信息需要进一步整理，成为生存分析需要的格式，新临床信息数据命名为meta。
rm(list=ls())
proj = "TCGA-KIRC"
load(paste0(proj,".Rdata"))
library(stringr)
1.整理表达矩阵
1.1去除normal样本
table(Group)
#> Group
#> normal  tumor 
#>     72    535
exprSet = exp[,Group=='tumor']
ncol(exp)
#> [1] 607
ncol(exprSet)
#> [1] 535
1.2 基因过滤
再次进行基因过滤.
(1)标准1：至少要在50%的样本里表达量大于0（最低标准）。
exp(600样本)满足“至少在300个样本里表达量>0”,不能等同于 exprSet(500样本)满足“至少在250个样本里表达量>0”
k = apply(exprSet,1, function(x){sum(x>0)>0.5*ncol(exprSet)});table(k)
#> k
#> FALSE  TRUE 
#>   155 31798
exprSet = exprSet[k,]
nrow(exprSet)
#> [1] 31798
(2)标准2：至少在一半以上样本里表达量＞10(其他数字也可，酌情调整)
k = apply(exprSet,1, function(x){sum(x>10)>0.5*ncol(exprSet)});table(k)
#> k
#> FALSE  TRUE 
#> 11718 20080
exprSet = exprSet[k,]
nrow(exprSet)
#> [1] 20080
1.3 使用logCPM或logTPM数据
exprSet[1:4,1:4]
#>            TCGA-BP-4342-01A TCGA-A3-3387-01A TCGA-BP-4167-01A TCGA-B8-4620-01A
#> WASH7P                   77               69               23               29
#> AL627309.6              282               36               26               88
#> WASH9P                  222              147               74               47
#> AP006222.1               77               23               23               14
exprSet=log2(edgeR::cpm(exprSet)+1)
exprSet[1:4,1:4]
#>            TCGA-BP-4342-01A TCGA-A3-3387-01A TCGA-BP-4167-01A TCGA-B8-4620-01A
#> WASH7P             1.049412        1.0031558        0.5304210        0.5510009
#> AL627309.6         2.297940        0.6078858        0.5871819        1.2698370
#> WASH9P             2.029993        1.6506568        1.2807483        0.8104724
#> AP006222.1         1.049412        0.4166163        0.5304210        0.2922303
1.4 样本与病人
有的病人会有两个或两个以上的肿瘤样本，就有重复。两种可行的办法：
（1）以病人为中心，对表达矩阵的列按照病人ID去重复，每个病人只保留一个样本。
exprSet = exprSet[,sort(colnames(exprSet))]
k = !duplicated(str_sub(colnames(exprSet),1,12));table(k)
#> k
#> FALSE  TRUE 
#>     5   530
exprSet = exprSet[,k] 
ncol(exprSet)
#> [1] 530
（2）以样本为中心，如果每个病人有多个样本则全部保留。(删掉上面这一段代码即可)
2.整理生存信息和临床信息
2.1数据合并
xena单独整理了生存信息，和临床信息是分开存放的。后续构建模型可能会需要纳入一些临床信息，所以要合并到一起。
library(dplyr)
nrow(surv)
#> [1] 979
nrow(clinical)
#> [1] 985
meta = left_join(surv,clinical,by = c("sample"= "submitter_id.samples"))
nrow(surv)
#> [1] 979
2.2 样本过滤
去掉生存信息不全或者生存时间小于30天的样本，样本纳排标准不唯一，且差别很大
k1 = meta$OS.time >= 30;table(k1)
#> k1
#> FALSE  TRUE 
#>    20   959
k2 = !(is.na(meta$OS.time)|is.na(meta$OS));table(k2)
#> k2
#> TRUE 
#>  979
meta = meta[k1&k2,]
2.3 选列、简化列名
小技巧：搜索列名
tmp = data.frame(colnames(meta))
#View(tmp)
选择需要的列，简化列名
meta = meta[,c(
  'sample',
  'OS',
  'OS.time',
  'race.demographic',
  'age_at_initial_pathologic_diagnosis',
  'gender.demographic' ,
  'tumor_stage.diagnoses'
)]
colnames(meta)=c('ID','event','time','race','age','gender','stage')
str(meta)
#> 'data.frame':    959 obs. of  7 variables:
#>  $ ID    : chr  "TCGA-B8-5162-01A" "TCGA-B4-5834-01A" "TCGA-CW-6087-01A" "TCGA-CW-6087-11A" ...
#>  $ event : int  0 0 1 1 1 1 1 1 0 1 ...
#>  $ time  : int  36 38 41 41 42 42 43 43 43 51 ...
#>  $ race  : chr  "white" "white" "white" "white" ...
#>  $ age   : int  62 59 61 61 75 75 65 65 35 72 ...
#>  $ gender: chr  "male" "male" "male" "male" ...
#>  $ stage : chr  "stage ii" "stage i" "stage iv" "stage iv" ...
2.4 简化、规范内容
结局事件
生存分析的输入数据里，要求结局事件必须用0和1表示，1表示阳性结局。
xena的数据是整理好的，其他来源的需要自行检查和整理。
table(meta$event)
#> 
#>   0   1 
#> 622 337
#自行整理的例子：
a = c("Dead", "Dead", "Alive", "Dead", "Alive")
ifelse(a=="Alive",0,1)
#> [1] 1 1 0 1 0
生存时间
认清生存时间的单位（通常是月，也可以用天和年）;
range(meta$time)
#> [1]   36 4537
meta$time = meta$time/30
range(meta$time)
#> [1]   1.2000 151.2333
(3)检查各列的信息是否规范，没有冗余信息。缺失的信息用NA代替
table(meta$stage)
#> 
#> not reported      stage i     stage ii    stage iii     stage iv 
#>            4          467          101          226          161
meta$stage = meta$stage %>% 
  str_replace("not reported",as.character(NA)) %>% 
  str_remove("stage ") %>% 
  str_to_upper()
table(meta$stage,useNA = "always")
#> 
#>    I   II  III   IV <NA> 
#>  467  101  226  161    4
table(meta$race)
#> 
#>                     asian black or african american              not reported 
#>                        16                        69                        14 
#>                     white 
#>                       860
meta$race = meta$race %>% 
  str_replace("not reported",as.character(NA))
table(meta$race,useNA = "always")
#> 
#>                     asian black or african american                     white 
#>                        16                        69                       860 
#>                      <NA> 
#>                        14
table(meta$gender,useNA = "always")
#> 
#> female   male   <NA> 
#>    327    632      0
str(meta)
#> 'data.frame':    959 obs. of  7 variables:
#>  $ ID    : chr  "TCGA-B8-5162-01A" "TCGA-B4-5834-01A" "TCGA-CW-6087-01A" "TCGA-CW-6087-11A" ...
#>  $ event : int  0 0 1 1 1 1 1 1 0 1 ...
#>  $ time  : num  1.2 1.27 1.37 1.37 1.4 ...
#>  $ race  : chr  "white" "white" "white" "white" ...
#>  $ age   : int  62 59 61 61 75 75 65 65 35 72 ...
#>  $ gender: chr  "male" "male" "male" "male" ...
#>  $ stage : chr  "II" "I" "IV" "IV" ...
3.实现表达矩阵与临床信息的匹配
即：meta的每一行与exprSet每一列一一对应
rownames(meta) <- meta$ID
s = intersect(rownames(meta),colnames(exprSet));length(s)
#> [1] 513
exprSet = exprSet[,s]
meta = meta[s,]
dim(exprSet)
#> [1] 20080   513
dim(meta)
#> [1] 513   7
identical(rownames(meta),colnames(exprSet))
#> [1] TRUE
save(meta,exprSet,proj,file = paste0(proj,"_sur_model.Rdata")) #将_sur_model.Rdata弄好

实操

1.准备输入数据
rm(list = ls())
proj = "TCGA-KIRC"
load(paste0(proj,"_sur_model.Rdata"))
ls()
#> [1] "exprSet" "meta"    "proj"
exprSet[1:4,1:4]
#>            TCGA-B8-5162-01A TCGA-B4-5834-01A TCGA-CW-6087-01A TCGA-B0-4698-01A
#> WASH7P           0.38330266        1.0823971       0.24141414        0.8200756
#> AL627309.6       0.04121953        0.1330091       0.60612548        1.7747381
#> WASH9P           0.95751073        1.8118992       0.66255436        1.7390920
#> AP006222.1       0.15825052        0.2713707       0.08503973        0.5589702
str(meta)
#> 'data.frame':    513 obs. of  7 variables:
#>  $ ID    : chr  "TCGA-B8-5162-01A" "TCGA-B4-5834-01A" "TCGA-CW-6087-01A" "TCGA-B0-4698-01A" ...
#>  $ event : int  0 0 1 1 1 0 1 0 1 0 ...
#>  $ time  : num  1.2 1.27 1.37 1.4 1.43 ...
#>  $ race  : chr  "white" "white" "white" "white" ...
#>  $ age   : int  62 59 61 75 65 35 72 50 84 76 ...
#>  $ gender: chr  "male" "male" "male" "male" ...
#>  $ stage : chr  "II" "I" "IV" "IV" ...
2.KM-plot
简单版本和进阶版本
library(survival)
library(survminer)
sfit <- survfit(Surv(time, event)~gender, data=meta)
ggsurvplot(sfit,pval=TRUE)
ggsurvplot(sfit,
           palette = "jco",
           risk.table =TRUE,
           pval =TRUE,
           conf.int =TRUE)
连续型信息怎么作KM分析？例如年龄,基因？
连续型数据的离散化
年龄
group = ifelse(meta$age>median(meta$age,na.rm = T),"older","younger")
table(group)
#> group
#>   older younger 
#>     251     262
sfit=survfit(Surv(time, event)~group, data=meta)
ggsurvplot(sfit,pval =TRUE, data = meta, risk.table = TRUE)
基因
g = rownames(exprSet)[1];g
#> [1] "WASH7P"
meta$gene = ifelse(exprSet[g,]> median(exprSet[g,]),'high','low')
sfit=survfit(Surv(time, event)~gene, data=meta)
ggsurvplot(sfit,pval =TRUE, data = meta, risk.table = TRUE)

2.log-rank test
KM的p值是log-rank test得出的，可批量操作
source("KM_cox_function.R")
logrankfile = paste0(proj,"_log_rank_p.Rdata")
if(!file.exists(logrankfile)){
  log_rank_p <- apply(exprSet , 1 , geneKM)
  log_rank_p=sort(log_rank_p)
  save(log_rank_p,file = logrankfile)
}
load(logrankfile)
table(log_rank_p<0.01) 
#> 
#> FALSE  TRUE 
#> 13023  7057
table(log_rank_p<0.05) 
#> 
#> FALSE  TRUE 
#> 10232  9848
3.批量单因素cox
coxfile = paste0(proj,"_cox.Rdata")
if(!file.exists(coxfile)){
  cox_results <-apply(exprSet , 1 , genecox)
  cox_results=as.data.frame(t(cox_results))
  save(cox_results,file = coxfile)
}
load(coxfile)
table(cox_results$p<0.01)
#> 
#> FALSE  TRUE 
#> 11911  8169
table(cox_results$p<0.05)
#> 
#> FALSE  TRUE 
#>  9373 10707
lr = names(log_rank_p)[log_rank_p<0.01];length(lr)
#> [1] 7057
cox = rownames(cox_results)[cox_results$p<0.01];length(cox)
#> [1] 8169
length(intersect(lr,cox))
#> [1] 6165
save(lr,cox,file = paste0(proj,"_logrank_cox_gene.Rdata"))

KM-cox

geneKM = function(gene){
  meta$group=ifelse(gene>median(gene),'high','low')  
  data.survdiff=survdiff(Surv(time, event)~group,data=meta)
  p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
  return(p.val)
}
genecox = function(gene){
  meta$gene = gene
  #可直接使用连续型变量
  m = coxph(Surv(time, event) ~ gene, data =  meta)
  #也可使用二分类变量
  #meta$group=ifelse(gene>median(gene),'high','low') 
  #meta$group = factor(meta$group,levels = c("low","high"))
  #m=coxph(Surv(time, event) ~ group, data =  meta)
  beta <- coef(m)
  se <- sqrt(diag(vcov(m)))
  HR <- exp(beta)
  HRse <- HR * se
  #summary(m)
  tmp <- round(cbind(coef = beta, 
                     se = se, z = beta/se, 
                     p = 1 - pchisq((beta/se)^2, 1),
                     HR = HR, HRse = HRse,
                     HRz = (HR - 1) / HRse, 
                     HRp = 1 - pchisq(((HR - 1)/HRse)^2, 1),
                     HRCILL = exp(beta - qnorm(.975, 0, 1) * se),
                     HRCIUL = exp(beta + qnorm(.975, 0, 1) * se)), 3)
  return(tmp['gene',]) 
  #return(tmp['grouphigh',])#二分类变量
}

生存模型

Lasso回归

作用：①将重要的基因筛选出来，
②可得出公式，得出参与基因的系数，代入后可得出风险值

训练集和测试集不同

1.准备输入数据
rm(list = ls())
proj = "TCGA-KIRC"
load(paste0(proj,"_sur_model.Rdata"))
ls()
#> [1] "exprSet" "meta"    "proj"
exprSet[1:4,1:4]
#>            TCGA-B8-5162-01A TCGA-B4-5834-01A TCGA-CW-6087-01A TCGA-B0-4698-01A
#> WASH7P           0.38330266        1.0823971       0.24141414        0.8200756
#> AL627309.6       0.04121953        0.1330091       0.60612548        1.7747381
#> WASH9P           0.95751073        1.8118992       0.66255436        1.7390920
#> AP006222.1       0.15825052        0.2713707       0.08503973        0.5589702
str(meta)
#> 'data.frame':    513 obs. of  7 variables:
#>  $ ID    : chr  "TCGA-B8-5162-01A" "TCGA-B4-5834-01A" "TCGA-CW-6087-01A" "TCGA-B0-4698-01A" ...
#>  $ event : int  0 0 1 1 1 0 1 0 1 0 ...
#>  $ time  : num  1.2 1.27 1.37 1.4 1.43 ...
#>  $ race  : chr  "white" "white" "white" "white" ...
#>  $ age   : int  62 59 61 75 65 35 72 50 84 76 ...
#>  $ gender: chr  "male" "male" "male" "male" ...
#>  $ stage : chr  "II" "I" "IV" "IV" ...
load(paste0(proj,"_logrank_cox_gene.Rdata"))
g = cox[1:1000] #这里是示例，取了1000个单因素cox筛选的基因，活学活用
exprSet = exprSet[g,]
2.构建lasso回归模型
输入数据是表达矩阵(仅含tumor样本)和每个病人对应的生死（顺序必须一致）。
x=t(exprSet)
y=meta$event
library(glmnet)
#> Warning: package 'glmnet' was built under R version 4.1.3
#> Warning: package 'Matrix' was built under R version 4.1.3
2.1挑选合适的λ值
Lambda 是构建模型的重要参数。他的大小关系着模型选择的基因个数
#调优参数
set.seed(1006)
cv_fit <- cv.glmnet(x=x, y=y)
plot(cv_fit)
#系数图
fit <- glmnet(x=x, y=y)
plot(fit,xvar = "lambda")
两条虚线分别指示了两个特殊的λ值,一个是lambda.min,一个是lambda.1se,这两个值之间的lambda都认为是合适的。
lambda.1se构建的模型最简单，即使用的基因数量少，而lambda.min则准确率更高一点，使用的基因数量更多一点。
2.2 用这两个λ值重新建模
model_lasso_min <- glmnet(x=x, y=y,lambda=cv_fit$lambda.min)
model_lasso_1se <- glmnet(x=x, y=y,lambda=cv_fit$lambda.1se)
选中的基因与系数存放于模型的子集beta中，用到的基因有一个s0值，没用的基因只记录了“.”，所以可以用下面代码挑出用到的基因。
head(model_lasso_min$beta,20)
choose_gene_min=rownames(model_lasso_min$beta)[as.numeric(model_lasso_min$beta)!=0]
choose_gene_1se=rownames(model_lasso_1se$beta)[as.numeric(model_lasso_1se$beta)!=0]
length(choose_gene_min)
#> [1] 41
length(choose_gene_1se)
#> [1] 33
save(choose_gene_min,file = paste0(proj,"_lasso_choose_gene_min.Rdata"))
save(choose_gene_1se,file = paste0(proj,"_lasso_choose_gene_1se.Rdata"))
3.模型预测和评估
newx参数是预测对象。输出结果lasso.prob是一个矩阵，第一列是min的预测结果，第二列是1se的预测结果，预测结果是概率，或者说百分比，不是绝对的0和1。
将每个样本的生死和预测结果放在一起，直接cbind即可。
lasso.prob <- predict(cv_fit, newx=x , s=c(cv_fit$lambda.min,cv_fit$lambda.1se) )
head(lasso.prob)
#>                           s1          s2
#> TCGA-B8-5162-01A  0.35441206  0.37673630
#> TCGA-B4-5834-01A  0.06975962  0.07579627
#> TCGA-CW-6087-01A  0.72387660  0.64176153
#> TCGA-B0-4698-01A  0.63402199  0.54280316
#> TCGA-B0-4690-01A  0.83592571  0.75407756
#> TCGA-AS-3778-01A -0.13649093 -0.02911398
ROC曲线
library(pROC)
library(ggplot2)
m <- roc(meta$event,lasso.prob[,1])
g <- ggroc(m,legacy.axes = T,size = 1,color = "#2fa1dd")
auc(m)
#> Area under the curve: 0.8687
g + theme_bw() +
  geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), 
               colour = "grey", linetype = "dashed")+
  annotate("text",x = .75, y = .25,
           label = paste("AUC of min = ",format(round(as.numeric(auc(m)),2),nsmall = 2)),color = "#2fa1dd")
计算AUC取值范围在0.5-1之间，越接近于1越好。可以根据预测结果绘制ROC曲线。
两个模型的曲线画在一起
m2 <- roc(meta$event, lasso.prob[,2])
auc(m2)
#> Area under the curve: 0.8507
g <- ggroc(list(min = m,se = m2),legacy.axes = T,size = 1)
g + theme_bw() +
  scale_color_manual(values = c("#2fa1dd", "#f87669"))+
  geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), 
               colour = "grey", linetype = "dashed")+
  annotate("text",x = .75, y = .25,
           label = paste("AUC of min = ",format(round(as.numeric(auc(m)),2),nsmall = 2)),color = "#2fa1dd")+
  annotate("text",x = .75, y = .15,
           label = paste("AUC of 1se = ",format(round(as.numeric(auc(m2)),2),nsmall = 2)),color = "#f87669")
5.切割数据构建模型并预测
5.1 切割数据（数据>500）
用R包caret切割数据，生成的结果是一组代表列数的数字，用这些数字来给表达矩阵和meta取子集即可。
library(caret)
#> Warning: package 'caret' was built under R version 4.1.3
set.seed(12345679)
sam<- createDataPartition(meta$event, p = .5,list = FALSE)
head(sam)
#>      Resample1
#> [1,]         5
#> [2,]         9
#> [3,]        13
#> [4,]        17
#> [5,]        19
#> [6,]        21
可查看两组一些临床参数切割比例
train <- exprSet[,sam]
test <- exprSet[,-sam]
train_meta <- meta[sam,]
test_meta <- meta[-sam,]
prop.table(table(train_meta$stage))
#> 
#>         I        II       III        IV 
#> 0.5058824 0.1019608 0.2117647 0.1803922
prop.table(table(test_meta$stage)) 
#> 
#>         I        II       III        IV 
#> 0.4941176 0.1176471 0.2470588 0.1411765
prop.table(table(test_meta$race)) 
#> 
#>                     asian black or african american                     white 
#>                0.01587302                0.09920635                0.88492063
prop.table(table(train_meta$race)) 
#> 
#>                     asian black or african american                     white 
#>                0.01574803                0.10236220                0.88188976
5.2 切割后的train数据集建模
和上面的建模方法一样。
#计算lambda
x = t(train)
y = train_meta$event
cv_fit <- cv.glmnet(x=x, y=y)
plot(cv_fit)
#构建模型
model_lasso_min <- glmnet(x=x, y=y,lambda=cv_fit$lambda.min)
model_lasso_1se <- glmnet(x=x, y=y,lambda=cv_fit$lambda.1se)
#挑出基因
head(model_lasso_min$beta)
#> 6 x 1 sparse Matrix of class "dgCMatrix"
#>            s0
#> WASH7P      .
#> AL627309.6  .
#> WASH9P      .
#> MTATP6P1    .
#> LINC01409   .
#> LINC00115   .
choose_gene_min=rownames(model_lasso_min$beta)[as.numeric(model_lasso_min$beta)!=0]
choose_gene_1se=rownames(model_lasso_1se$beta)[as.numeric(model_lasso_1se$beta)!=0]
length(choose_gene_min)
#> [1] 31
length(choose_gene_1se)
#> [1] 21
4.模型预测
用训练集构建模型，预测测试集的生死，注意newx参数变了。
lasso.prob <- predict(cv_fit, newx=t(test), s=c(cv_fit$lambda.min,cv_fit$lambda.1se) )
head(lasso.prob)
#>                           s1         s2
#> TCGA-B8-5162-01A  0.39653904 0.43398261
#> TCGA-B4-5834-01A  0.10082314 0.16288716
#> TCGA-CW-6087-01A  0.69858720 0.63793147
#> TCGA-B0-4698-01A  0.52026571 0.54582571
#> TCGA-AS-3778-01A -0.02908933 0.07121892
#> TCGA-B2-4098-01A  0.38169229 0.35982935
再画ROC曲线
library(pROC)
library(ggplot2)
m <- roc(test_meta$event, lasso.prob[,1])
g <- ggroc(m,legacy.axes = T,size = 1,color = "#2fa1dd")
auc(m)
#> Area under the curve: 0.7696
g + theme_bw() +
  geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), 
               colour = "grey", linetype = "dashed")+
  annotate("text",x = .75, y = .25,
           label = paste("AUC of min = ",format(round(as.numeric(auc(m)),2),nsmall = 2)),color = "#2fa1dd")
计算AUC取值范围在0.5-1之间，越接近于1越好。可以根据预测结果绘制ROC曲线。
两个模型的曲线画在一起
m2 <- roc(test_meta$event, lasso.prob[,2])
auc(m2)
#> Area under the curve: 0.7569
g <- ggroc(list(min = m,se = m2),legacy.axes = T,size = 1)
g + theme_bw() +
  scale_color_manual(values = c("#2fa1dd", "#f87669"))+
  geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), 
               colour = "grey", linetype = "dashed")+
  annotate("text",x = .75, y = .25,
           label = paste("AUC of min = ",format(round(as.numeric(auc(m)),2),nsmall = 2)),color = "#2fa1dd")+
  annotate("text",x = .75, y = .15,
           label = paste("AUC of 1se = ",format(round(as.numeric(auc(m2)),2),nsmall = 2)),color = "#f87669")

用自己的数据进行逻辑回归- lasso分析
导入数据格式

rm(list = ls()) 
library(glmnet)
library(readxl)
library(plyr)
library(caret)
library(corrplot)
library(ggplot2)
library(Hmisc)
library(openxlsx)
data <- read.xlsx("/Users/mac/Desktop/r/data1.xlsx")
x <- as.matrix(data[,-c(1:2)])
y <- as.double(data$Group)
fit <- glmnet(x,y,family = "binomial",nlambda = 1000,alpha = 1)
print(fit)
plot(fit,xvar = "lambda")
#筛选最合适的变量
lasso_fit <- cv.glmnet(x,y,family="binomial",alpha = 1,type.measure = "auc",nlambda = 1000)
plot(lasso_fit)
print(lasso_fit)
#使用lambda.lse 获取的模型较好
lasso_best <- glmnet(x=x,y=y,alpha = 1,lambda = lasso_fit$lambda.1se)
coef(lasso_best)
coefficient <- coef(lasso_best,s=lasso_best$lambda.lse)
coe <- coefficient@x
coe <- as.data.frame(coe)
Active_Index <- which(as.numeric(coefficient)!=0)
active_coefficients <- as.numeric(coefficient)[Active_Index]
variable <- rownames(coefficient)[Active_Index]
variable <- as.data.frame(variable)
variable <-cbind(variable,coe)
#使用lambda.min 适用于样本质量较差，纳入模型中的变量较多
lasso_best <- glmnet(x=x,y=y,alpha = 1,lambda = lasso_fit$lambda.min)
coef(lasso_best)
coefficient <- coef(lasso_best,s=lasso_best$lambda.min)
coe <- coefficient@x
coe <- as.data.frame(coe)
Active_Index <- which(as.numeric(coefficient)!=0)
active_coefficients <- as.numeric(coefficient)[Active_Index]
variable <- rownames(coefficient)[Active_Index]
variable <- as.data.frame(variable)
variable <-cbind(variable,coe)

timeROC

cox-forest

1.准备输入数据
rm(list = ls())
proj = "TCGA-KIRC"
if(!require(My.stepwise))install.packages("My.stepwise")
#> Warning: package 'My.stepwise' was built under R version 4.1.3
load(paste0(proj,"_sur_model.Rdata"))
load(paste0(proj,"_lasso_choose_gene_1se.Rdata"))
g = choose_gene_1se
2.构建coxph模型
将用于建模的基因（例如lasso回归选中的基因）从表达矩阵中取出来，，可作为列添加在meta表噶的后面,组成的数据框赋值给dat。
library(stringr)
e=t(exprSet[g,])
colnames(e)= str_replace_all(colnames(e),"-","_")
dat=cbind(meta,e)
dat$gender=as.numeric(factor(dat$gender))
dat$stage=as.numeric(factor(dat$stage))
colnames(dat)
#>  [1] "ID"         "event"      "time"       "race"       "age"       
#>  [6] "gender"     "stage"      "AL391244.2" "AJAP1"      "AL357140.2"
#> [11] "CPLANE2"    "RCC2"       "AL137127.1" "NIPAL3"     "KDF1"      
#> [16] "LINC01389"  "ANGPTL3"    "GADD45A"    "LINC01725"  "LINC02609" 
#> [21] "CHI3L2"     "LINC01719"  "KCNN3"      "MSTO1"      "PMF1_BGLAP"
#> [26] "CRABP2"     "CD1E"       "FCER1A"     "NUF2"       "DUTP6"     
#> [31] "IGFN1"      "NEK2"       "PROX1"      "DUSP5P1"    "HS1BP3_IT1"
#> [36] "OTOF"       "CENPA"      "SLC5A6"     "PRKCE"      "SPTBN1"
逐步回归法构建最优模型
输出结果行数太多，所以我注释掉了
library(survival)
#> Warning: package 'survival' was built under R version 4.1.3
library(survminer)
# 不能允许缺失值
dat2 = na.omit(dat)
library(My.stepwise)
vl <- colnames(dat)[c(5:ncol(dat))]
# My.stepwise.coxph(Time = "time",
#                   Status = "event",
#                   variable.list = vl,
#                   data = dat2)
使用输出结果里的最后一个模型
model = coxph(formula = Surv(time, event) ~ stage + AL357140.2 + AL391244.2 + 
    age + OTOF + IGFN1 + CRABP2 + PMF1_BGLAP + SLC5A6 + PROX1 + 
    LINC01719 + DUTP6, data = dat2)
 3.模型可视化-森林图
ggforest(model,data = dat2)
4.模型预测
fp <- predict(model,newdata = dat2)
library(Hmisc)
#> Warning: package 'Hmisc' was built under R version 4.1.3
options(scipen=200)
with(dat2,rcorr.cens(fp,Surv(time, event))) #with函数下可将列名当作变量用
#>         C Index             Dxy            S.D.               n         missing 
#>      0.17850596（实为1- C Index 的值，故须换算）     -0.64298808      0.02857663    503.00000000      0.00000000 
#>      uncensored  Relevant Pairs      Concordant       Uncertain 
#>    166.00000000 105834.00000000  18892.00000000 146654.00000000
C-index用于计算生存分析中的COX模型预测值与真实之间的区分度（discrimination），也称为Harrell’s concordanceindex。C-index在0.5-1之间。0.5为完全不一致,说明该模型没有预测作用,1为完全一致,说明该模型预测结果与实际完全一致。
5.切割数据构建模型并预测
5.1 切割数据
用R包caret切割数据，生成的结果是一组代表列数的数字，用这些数字来给表达矩阵和meta取子集即可。
library(caret)
#> Warning: package 'caret' was built under R version 4.1.3
set.seed(12345679)
sam<- createDataPartition(meta$event, p = .5,list = FALSE)
train <- exprSet[,sam]
test <- exprSet[,-sam]
train_meta <- meta[sam,]
test_meta <- meta[-sam,]
5.2 切割后的train数据集建模
和上面的建模方法一样。
e=t(train[g,])
colnames(e)= str_replace_all(colnames(e),"-","_")
dat=cbind(train_meta,e)
dat$gender=as.numeric(factor(dat$gender))
dat$stage=as.numeric(factor(dat$stage))
colnames(dat)
#>  [1] "ID"         "event"      "time"       "race"       "age"       
#>  [6] "gender"     "stage"      "AL391244.2" "AJAP1"      "AL357140.2"
#> [11] "CPLANE2"    "RCC2"       "AL137127.1" "NIPAL3"     "KDF1"      
#> [16] "LINC01389"  "ANGPTL3"    "GADD45A"    "LINC01725"  "LINC02609" 
#> [21] "CHI3L2"     "LINC01719"  "KCNN3"      "MSTO1"      "PMF1_BGLAP"
#> [26] "CRABP2"     "CD1E"       "FCER1A"     "NUF2"       "DUTP6"     
#> [31] "IGFN1"      "NEK2"       "PROX1"      "DUSP5P1"    "HS1BP3_IT1"
#> [36] "OTOF"       "CENPA"      "SLC5A6"     "PRKCE"      "SPTBN1"
library(My.stepwise)
dat2 = na.omit(dat)
vl <- colnames(dat2)[c(5:ncol(dat2))]
# My.stepwise.coxph(Time = "time",
#                   Status = "event",
#                   variable.list = vl,
#                   data = dat2)
model = coxph(formula = Surv(time, event) ~ stage + AL357140.2 + IGFN1 + 
    AJAP1 + ANGPTL3 + SLC5A6 + HS1BP3_IT1 + AL137127.1 + LINC01719 + 
    CRABP2, data = dat2)
5.3 模型可视化
ggforest(model, data =dat2)
5.4 用切割后的数据test数据集验证模型
e=t(test[g,])
colnames(e)= str_replace_all(colnames(e),"-","_")
test_dat=cbind(test_meta,e)
test_dat$gender=as.numeric(factor(test_dat$gender))
test_dat$stage=as.numeric(factor(test_dat$stage))
fp <- predict(model,newdata = test_dat)
library(Hmisc)
with(test_dat,rcorr.cens(fp,Surv(time, event)))
#>        C Index            Dxy           S.D.              n        missing 
#>     0.22187105    -0.55625790     0.04736798   255.00000000     1.00000000 
#>     uncensored Relevant Pairs     Concordant      Uncertain 
#>    86.00000000 28476.00000000  6318.00000000 36294.00000000