获取更多R语言知识,请关注公众号:医学和生信笔记
医学和生信笔记 公众号主要分享:1.医学小知识、肛肠科小知识;2.R语言和Python相关的数据分析、可视化、机器学习等;3.生物信息学学习资料和自己的学习笔记!
从TCGA下载的counts,FPKM和miRNA是很多单个的文件,需要手动合成一个完整的表达矩阵,这是一个高频操作,因此写一段代码简化流程,部分代码来自生信技能树和生信星球。
把以下代码保存为“get_TCGA_expr.R”,下次直接 source(get_TCGA_expr.R),就可以使用定义好的3个函数。
get_TCGAcounts
get_TCGAfpkm
get_TCGAmirna
## 需要提供counts文件所在的文件夹目录和metadata路径
## 类似于 countsfiles <- "E:/TCGA_COADREAD/mirna/"
## metadata <- "E:/TCGA_COADREAD/mirna/metadata.cart.2021-05-31.json"
# 获取counts表达矩阵,
get_TCGAcounts <- function(rnaseqpath, metadatapath){
allfiles <- list.files(path = rnaseqpath ,pattern='*.counts.gz$',recursive=T)
mRNA_seq <- lapply(allfiles, function(x){
result <- data.table::fread(file = file.path(rnaseqpath,x),data.table = F)
return( result )
})
mRNA_df <- Reduce(cbind, mRNA_seq)
tem <- nrow(mRNA_df)-5
mRNA_df <- mRNA_df[1:tem, c(1,seq(2,ncol(mRNA_df),2))]
save(mRNA_df,file = './TCGA_RNA.Rdata')
#读取metadata信息
meta <- jsonlite::fromJSON(metadatapath)
entity <- meta$associated_entities
tcga_id <- unlist(lapply(entity,function(x){x["entity_submitter_id"]}))
file_name <- meta$file_name
fln2id <- data.frame(filename = file_name, tcgaid = tcga_id)
#把样本名(tcga_patient_id)变成表达矩阵的列名
mis2 <- stringr::str_split(allfiles,"/",simplify = T)[,2]
col_tcga <- fln2id[match(mis2,fln2id$filename),]
colnames(mRNA_df)[2:ncol(mRNA_df)] <- as.character(col_tcga$tcgaid)
rnaExpr <<- mRNA_df
save(rnaExpr,file = "./TCGA_new_RNA.Rdata")
print(paste("your RNA exprset has been saved at: ", getwd(), sep = ""))
}
# 获取FPKM表达矩阵
get_TCGAfpkm <- function(rnaseqpath, metadatapath){
allfiles <- list.files(path = rnaseqpath ,pattern='*.FPKM.txt.gz$',recursive=T)
mRNA_seq <- lapply(allfiles, function(x){
result <- data.table::fread(file = file.path(rnaseqpath,x),data.table = F)
return( result )
})
mRNA_df <- Reduce(cbind, mRNA_seq)
tem <- nrow(mRNA_df)-5
mRNA_df <- mRNA_df[1:tem, c(1,seq(2,ncol(mRNA_df),2))]
save(mRNA_df,file = './TCGA_RNA.Rdata')
#读取metadata信息
meta <- jsonlite::fromJSON(metadatapath)
entity <- meta$associated_entities
tcga_id <- unlist(lapply(entity,function(x){x["entity_submitter_id"]}))
file_name <- meta$file_name
fln2id <- data.frame(filename = file_name, tcgaid = tcga_id)
#把样本名(tcga_patient_id)变成表达矩阵的列名
mis2 <- stringr::str_split(allfiles,"/",simplify = T)[,2]
col_tcga <- fln2id[match(mis2,fln2id$filename),]
colnames(mRNA_df)[2:ncol(mRNA_df)] <- as.character(col_tcga$tcgaid)
rnaExpr <<- mRNA_df
save(rnaExpr,file = "./TCGA_new_RNA.Rdata")
print(paste("your RNA exprset has been saved at: ", getwd(), sep = ""))
}
# 获取mirna表达矩阵
get_TCGAmirna <- function(rnaseqpath, metadatapath){
allfiles <- list.files(path = rnaseqpath ,pattern='*.mirnas.quantification.txt$',recursive=T)
mRNA_seq <- lapply(allfiles, function(x){
result <- data.table::fread(file = file.path(rnaseqpath,x),data.table = F)
return( result )
})
mRNA_df <- Reduce(cbind, mRNA_seq)
mRNA_df <- mRNA_df[, c(1,seq(2,ncol(mRNA_df),4))]
save(mRNA_df,file = './TCGA_miRNA.Rdata')
#读取metadata信息
meta <- jsonlite::fromJSON(metadatapath)
entity <- meta$associated_entities
tcga_id <- unlist(lapply(entity,function(x){x["entity_submitter_id"]}))
file_name <- meta$file_name
fln2id <- data.frame(filename = file_name, tcgaid = tcga_id)
#把样本名(tcga_patient_id)变成表达矩阵的列名
mis2 <- stringr::str_split(allfiles,"/",simplify = T)[,2]
col_tcga <- fln2id[match(mis2,fln2id$filename),]
colnames(mRNA_df)[2:ncol(mRNA_df)] <- as.character(col_tcga$tcgaid)
rnaExpr <<- mRNA_df
save(rnaExpr,file = "./TCGA_miRNAexpr.Rdata")
print(paste("your RNA exprset has been saved at: ", getwd(), sep = ""))
获取更多R语言知识,请关注公众号:医学和生信笔记
医学和生信笔记 公众号主要分享:1.医学小知识、肛肠科小知识;2.R语言和Python相关的数据分析、可视化、机器学习等;3.生物信息学学习资料和自己的学习笔记!