# I. Sample grouping ----

# Clear the global environment, then restore the objects saved by step 1 so
# that the workspace holds only what "step1output.Rdata" contains.
# NOTE(review): rm(list = ls()) also wipes anything the caller had defined;
# drop it if this script is ever source()d from a session that must survive.
rm(list = ls())
load(file = "step1output.Rdata")
library(stringr)
library(tinyarray)
# Three alternative ways to build `Group` are shown below. They run in order
# and each assignment overwrites the previous one, so the value that reaches
# the factor() call in step 4 is the one produced by method 3.
# NOTE(review): `pd` is not defined in this script; presumably it is restored
# from "step1output.Rdata" - confirm.

# 1. A ready-made metadata column can be used for grouping.
Group <- pd$`disease state:ch1`

# 2. Build the vector by hand: 13 "RA" samples followed by 9 "control".
Group <- c(
  rep("RA", times = 13),
  rep("control", times = 9)
)
# Same vector, written as a single rep() call.
Group <- rep(c("RA", "control"), times = c(13, 9))

# 3. Classify by keyword: samples whose source name contains "control" are
#    labelled "control", everything else "RA".
Group <- ifelse(
  str_detect(pd$source_name_ch1, "control"),
  "control",
  "RA"
)

# 4. Set the reference level by giving explicit levels: control group first,
#    treatment group second.
Group <- factor(Group, levels = c("control", "RA"))
Group
# II. Obtaining the probe annotation ----
# Each method below assigns `ids`; a later assignment replaces an earlier one.
# NOTE(review): `gpl_number` is not defined in this script; presumably it is
# restored from "step1output.Rdata" - confirm.

# 1. Shortcut.
find_anno(gpl_number)
ids <- AnnoProbe::idmap("GPL570")

# 2. Bioconductor annotation package (the most commonly used route).
# http://www.bio-info-trainee.com/1399.html
gpl_number
# requireNamespace() only tests whether the package is installed; the
# library() call below is what attaches it and fails loudly if the install
# did not succeed.
if (!requireNamespace("hgu133plus2.db", quietly = TRUE)) {
  BiocManager::install("hgu133plus2.db")
}
library(hgu133plus2.db)
ls("package:hgu133plus2.db")
ids <- toTable(hgu133plus2SYMBOL)
head(ids)
# 3. Read the GPL platform SOFT file and subset it by column.
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL570
# Note: column names in SOFT files are not standardised, so adapt the column
# selection to the file at hand; some tables have no symbol column, and some
# GPL platforms provide no annotation at all.

## Annotation file downloaded from the web page (.soft), so AnnotGPL = FALSE.
# NOTE(review): this reads a GPL10295 file while the URL above and the other
# methods refer to GPL570, and the result replaces any `ids` assigned earlier
# with columns named ID/Symbol (the disabled block below uses
# probe_id/symbol) - confirm this is the annotation that should be saved.
# getGEO()/Table() are namespace-qualified because GEOquery is not attached
# anywhere in this script.
GPL_data <- GEOquery::getGEO(
  filename = "GPL10295_family.soft.gz",
  AnnotGPL = FALSE
)
GPL_data_1 <- GEOquery::Table(GPL_data)
ids <- GPL_data_1 %>%
  dplyr::select(ID, Symbol)

## Reference code from the Biotrainee tutorial; disabled, never executed.
if (FALSE) {
  a <- GEOquery::getGEO(gpl_number, destdir = ".")
  b <- a@dataTable@table
  colnames(b)
  ids2 <- b[, c("ID", "Gene Symbol")]
  colnames(ids2) <- c("probe_id", "symbol")
  # Keep probes with a non-empty symbol that is not a "///"-joined entry.
  ids2 <- ids2[ids2$symbol != "" & !str_detect(ids2$symbol, "///"), ]
}
# 4. Download the annotation file from the vendor website and read it in.
# http://www.affymetrix.com/support/technical/byproduct.affx?product=hg-u133-plus
# 5. Do-it-yourself annotation.
# https://mp.weixin.qq.com/s/mrtjpN8yDKUdCSvSUuUwcA
# III. Save the data ----
# `exp` is also the name of a base R function. If no data object called `exp`
# exists in the workspace, the name resolves to that function, so stop here
# instead of writing it into the output file.
stopifnot(!is.function(exp))
save(exp, Group, ids, gse_number, file = "step2output.Rdata")
