load R packages
library(dplyr)library(tibble)library(data.table)library(impute)
load phenotype
phen <- read.csv("../../Study/Phenotype/clinical__information.csv")prof <- fread("../../Study/RawData/samples_expression.txt")
curation
choose index of the phenotype for survival analysis
phen.cln <- phen %>% mutate(stage_event_tnm_categories = paste0(pathologic_T, pathologic_N, pathologic_M)) %>% dplyr::select(bcr_patient_barcode, gender, vital_status, days_to_death, days_to_last_followup, race, person_neoplasm_cancer_status, age_at_initial_pathologic_diagnosis, laterality, neoplasm_histologic_grade, pathologic_stage, stage_event_tnm_categories) %>% distinct(bcr_patient_barcode, .keep_all = TRUE)# dead_patient dead_patient <- phen.cln %>% dplyr::filter(vital_status == "Dead") %>% dplyr::select(-days_to_last_followup) %>% reshape::rename(c(bcr_patient_barcode = "Barcode", gender = "Gender", vital_status = "OS", days_to_death = "OS.Time", race = "Race", person_neoplasm_cancer_status = "cancer_status", age_at_initial_pathologic_diagnosis = "Age", neoplasm_histologic_grade = "Grade", pathologic_stage = "Stage", stage_event_tnm_categories = "TNM")) %>% mutate(OS = ifelse(OS == "Dead", 1, 0)) %>% mutate(OS.Time = as.numeric(as.character(OS.Time))/365)# alive_patient alive_patient <- phen.cln %>% dplyr::filter(vital_status == "Alive") %>% dplyr::select(-days_to_death) %>% reshape::rename(c(bcr_patient_barcode = "Barcode", gender = "Gender", vital_status = "OS", days_to_last_followup = "OS.Time", race = "Race", person_neoplasm_cancer_status = "cancer_status", age_at_initial_pathologic_diagnosis = "Age", neoplasm_histologic_grade = "Grade", pathologic_stage = "Stage", stage_event_tnm_categories = "TNM")) %>% mutate(OS = ifelse(OS == "Dead", 1, 0)) %>% mutate(OS.Time = as.numeric(as.character(OS.Time))/365)# combine data survival_data <- rbind(dead_patient, alive_patient) %>% mutate(Barcode = gsub("-", "_", Barcode))
filter expression matrix
- First, the biological features were removed if having zero value in more than 20% of patients.
- The samples were removed if missing across more than 20% features.
- Then we used impute function from R impute package, to fill out the missing values.
- Lastly, we removed input features with zero values across all samples.
get_profile <- function(profile = prof, metadata = survival_data, occurrence = 0.2, present = 0.2){ # profile = prof # metadata = survival_data # occurrence = 0.2 # present = 0.2 colnames(profile) <- c(colnames(profile)[1:3], gsub("-", "_", colnames(profile)[4:ncol(profile)])) filter_fill_fun <- function(tag){ # tag = data_type[1] dat <- profile %>% filter(Platform == tag) %>% dplyr::select(-c("Platform", "Description")) # feature occurrence feature_occ <- apply(dat[, -1], 1, function(x){sum(x[!is.na(x)] != 0)/length(x)}) %>% data.frame() %>% setNames("feature_occurrence") %>% mutate(GeneSymbol = dat$GeneSymbol) remain_feature <- feature_occ %>% filter(feature_occurrence > occurrence) # sample occurrence sample_occ <- apply(dat[, -1], 2, function(x){sum(x[!is.na(x)] != 0)/length(x)}) %>% data.frame() %>% setNames("sample_occurrence") %>% rownames_to_column("sample") remain_sample <- sample_occ %>% filter(sample_occurrence > present) # remove duplicate features dat.cln <- dat %>% filter(GeneSymbol%in%as.character(remain_feature$GeneSymbol)) %>% dplyr::select(c("GeneSymbol", as.character(remain_sample$sample))) dat.cln$median <- apply(dat.cln[, -1], 1, median) dat.cln <- with(dat.cln, dat.cln[order(GeneSymbol, median, decreasing = T), ]) dat.symbol <- dat.cln[!duplicated(dat.cln$GeneSymbol), ] %>% dplyr::select(-median) %>% column_to_rownames("GeneSymbol") # impute.knn set.seed(12345) saved.state <- .Random.seed dat.imputed <- impute.knn(as.matrix(dat.symbol)) return(dat.imputed) } data_type <- unique(profile$Platform) #print(data_type) copyNumber <- filter_fill_fun(data_type[1]) geneExp <- filter_fill_fun(data_type[2]) methylation <- filter_fill_fun(data_type[3]) protein_RPPA <- filter_fill_fun(data_type[4]) copyNumber_prf <- copyNumber$data geneExp_prf <- geneExp$data methylation_prf <- methylation$data protein_RPPA_prf <- protein_RPPA$data common_sampleid <- intersect(metadata$Barcode, intersect(colnames(copyNumber_prf), intersect(colnames(geneExp_prf), intersect(colnames(methylation_prf), colnames(protein_RPPA_prf)) ) ) ) metadata.cln <- metadata %>% filter(Barcode %in% common_sampleid) copyNumber_prf.cln <- copyNumber_prf[, colnames(copyNumber_prf)%in%common_sampleid] geneExp_prf.cln <- geneExp_prf[, colnames(geneExp_prf)%in%common_sampleid] methylation_prf.cln <- methylation_prf[, colnames(methylation_prf)%in%common_sampleid] protein_RPPA_prf.cln <- protein_RPPA_prf[, colnames(protein_RPPA_prf)%in%common_sampleid] res <- list(meta = metadata.cln, CN = copyNumber_prf.cln, GE = geneExp_prf.cln, ML = methylation_prf.cln, PR = protein_RPPA_prf.cln) return(res)}data_trim_res <- get_profile(profile = prof, metadata = survival_data)
output
dir_prf <- "../../Result/profile"if(!dir.exists(dir_prf)){ dir.create(dir_prf)}# copyNumbercopyNumber.path <- paste(dir_prf, "copyNumber_filter.tsv", sep = "/")write.table(data_trim_res$CN, file = copyNumber.path, quote = F, sep = "\t")# geneExpgeneExp.path <- paste(dir_prf, "geneExp_filter.tsv", sep = "/")write.table(data_trim_res$GE, file = geneExp.path, quote = F, sep = "\t")# methylationmethylation.path <- paste(dir_prf, "methylation_filter.tsv", sep = "/")write.table(data_trim_res$ML, file = methylation.path, quote = F, sep = "\t")# protein_RPPAprotein_RPPA.path <- paste(dir_prf, "protein_RPPA_filter.tsv", sep = "/")write.table(data_trim_res$PR, file = protein_RPPA.path, quote = F, sep = "\t")dir_phe <- "../../Result/phenotype"if(!dir.exists(dir_phe)){ dir.create(dir_phe)}# total all_survival.path <- paste(dir_phe, "all_survival_data.tsv", sep = "/")write.table(survival_data, file = all_survival.path, quote = F, sep = "\t", row.names = F)# total common_survival.path <- paste(dir_phe, "common_survival_data.tsv", sep = "/")write.table(data_trim_res$meta, file = common_survival.path, quote = F, sep = "\t", row.names = F)
version
sessionInfo()
R version 3.6.3 (2020-02-29)Platform: x86_64-conda-linux-gnu (64-bit)Running under: CentOS Linux 7 (Core)Matrix products: defaultBLAS/LAPACK: /data/share/anaconda3/lib/libopenblasp-r0.3.12.solocale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C [9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C attached base packages:[1] stats graphics grDevices utils datasets methods base other attached packages:[1] impute_1.60.0 ggpubr_0.4.0 ggplot2_3.3.3 data.table_1.13.6 tibble_3.0.4 dplyr_1.0.2 loaded via a namespace (and not attached): [1] Rcpp_1.0.5 lattice_0.20-41 tidyr_1.1.2 rprojroot_2.0.2 assertthat_0.2.1 digest_0.6.27 R6_2.5.0 [8] cellranger_1.1.0 plyr_1.8.6 backports_1.2.1 evaluate_0.14 pillar_1.4.7 rlang_0.4.10 curl_4.3 [15] readxl_1.3.1 rstudioapi_0.13 car_3.0-10 vegan_2.5-7 Matrix_1.3-0 rmarkdown_2.6 desc_1.2.0 [22] splines_3.6.3 stringr_1.4.0 foreign_0.8-76 munsell_0.5.0 broom_0.7.3 compiler_3.6.3 xfun_0.19 [29] pkgconfig_2.0.3 mgcv_1.8-33 htmltools_0.5.0 tidyselect_1.1.0 rio_0.5.16 reshape_0.8.8 fansi_0.4.1 [36] permute_0.9-5 crayon_1.3.4 withr_2.3.0 MASS_7.3-53 grid_3.6.3 nlme_3.1-150 gtable_0.3.0 [43] lifecycle_0.2.0 magrittr_2.0.1 scales_1.1.1 zip_2.1.1 cli_2.2.0 stringi_1.5.3 carData_3.0-4 [50] ggsignif_0.6.0 testthat_3.0.1 ellipsis_0.3.1 generics_0.1.0 vctrs_0.3.6 cowplot_1.1.1 openxlsx_4.2.3 [57] tools_3.6.3 forcats_0.5.0 glue_1.4.2 purrr_0.3.4 hms_0.5.3 abind_1.4-5 pkgload_1.1.0 [64] parallel_3.6.3 yaml_2.2.1 colorspace_2.0-0 cluster_2.1.0 rstatix_0.6.0 knitr_1.30 haven_2.3.1
Reference
- TCGA数据下载整理问题
- fill out the missing values