在使用 kallisto 等基于 cDNA 数据库进行注释/定量的软件时,我们获得的表达矩阵(gene expression matrix)使用的是 transcript ID,因此需要进行 ID 转换(transcript ID → gene ID)。
加载R包
# Install BUSpaRse from GitHub once if it is not available yet:
# devtools::install_github("BUStools/BUSpaRse")
# Attach the package; this call must sit on its own line, otherwise it is
# swallowed by the comment above and the package is never loaded.
library(BUSpaRse)
获取数据
国内访问 Ensembl 较为困难;BUSpaRse 内部使用的 biomaRt 包可以设置镜像,有时间可以修改一下。
# Build the transcript-to-gene mapping for human and mouse from Ensembl
# release 100, passing the current directory as the kallisto output path.
tr2g <- transcript2gene(
  c("Homo sapiens", "Mus musculus"),
  type = "vertebrate",
  ensembl_version = 100,
  kallisto_out_path = "./"
)

biomart方法获取
上述方法同样是通过 biomaRt 包访问 Ensembl 接口,因此直接使用 biomaRt 可能更好。
获取基因组的基因list
# Write a two-column, header-less TSV (gene ID, gene length) with one row per
# "gene" feature of the GTF annotation.
#  - The first attribute of column 9 is taken as the gene_id field; the literal
#    `gene_id ` key and the double quotes are stripped from that field only.
#    (A sed bracket expression such as [gene_id |"|] would instead delete every
#    one of those characters individually and corrupt IDs containing lower-case
#    letters.)
#  - GTF coordinates are 1-based and inclusive, so length = end - start + 1.
awk -F'\t' '
  $3 == "gene" {
    split($9, attrs, ";")
    gene_id = attrs[1]
    gsub(/gene_id |"/, "", gene_id)
    print gene_id "\t" ($5 - $4 + 1)
  }
' Homo_sapiens.GRCh38.101.gtf |
  sort -u > Homo_sapiens.GRCh38.101.genelength.tsv
# Query Ensembl BioMart for the annotation of every gene listed in the
# gene-length table (Homo_sapiens.GRCh38.101.genelength.tsv).
library(biomaRt)
library(curl)

# The gene-length table has no header row, so the column names are supplied
# here. Reading it with header = TRUE would consume the first gene as a header
# and leave no `Geneid` column to filter on.
genelist <- read.table(
  "Homo_sapiens.GRCh38.101.genelength.tsv",
  header = FALSE,
  sep = "\t",
  col.names = c("Geneid", "Length"),
  stringsAsFactors = FALSE
)

# Connect to the human gene dataset of the Ensembl BioMart. The host is given
# with an explicit https:// scheme, which current biomaRt releases expect.
# NOTE(review): www.ensembl.org serves the current Ensembl release while the
# GTF above is release 101 — confirm whether the release should be pinned.
human_mart <- useMart(
  host = "https://www.ensembl.org",
  biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "hsapiens_gene_ensembl"
)

# Fetch gene/transcript identifiers, biotype and description for the listed
# Ensembl gene IDs.
human_gene_all <- getBM(
  attributes = c(
    "ensembl_gene_id",
    "entrezgene_id",
    "external_gene_name",
    "ensembl_transcript_id",
    "ensembl_transcript_id_version",
    "transcript_biotype",
    "description"
  ),
  filters = "ensembl_gene_id",
  values = genelist$Geneid,
  mart = human_mart
)
