How to preprocess the origin data from TCGA

load R packages
load phenotype
curation
- choose index of the phenotype for survival analysis
- filter expression matrix
output
version
Reference

load R packages

library(dplyr)
library(tibble)
library(data.table)
library(impute)

load phenotype

phen <- read.csv("../../Study/Phenotype/clinical__information.csv")
prof <- fread("../../Study/RawData/samples_expression.txt")

curation

choose index of the phenotype for survival analysis

phen.cln <- phen %>% 
  mutate(stage_event_tnm_categories = paste0(pathologic_T, pathologic_N, pathologic_M)) %>%
  dplyr::select(bcr_patient_barcode, gender, vital_status, days_to_death, 
                          days_to_last_followup, race, person_neoplasm_cancer_status, 
                          age_at_initial_pathologic_diagnosis, laterality, 
                          neoplasm_histologic_grade, pathologic_stage, stage_event_tnm_categories) %>%
  distinct(bcr_patient_barcode, .keep_all = TRUE)
# dead_patient 
dead_patient <- phen.cln %>%
  dplyr::filter(vital_status == "Dead") %>%
  dplyr::select(-days_to_last_followup) %>%
  reshape::rename(c(bcr_patient_barcode = "Barcode",
           gender = "Gender",
           vital_status = "OS",
           days_to_death = "OS.Time",
           race = "Race",
           person_neoplasm_cancer_status = "cancer_status",
           age_at_initial_pathologic_diagnosis = "Age",
           neoplasm_histologic_grade = "Grade",
           pathologic_stage = "Stage",
           stage_event_tnm_categories = "TNM")) %>%
  mutate(OS = ifelse(OS == "Dead", 1, 0)) %>%
  mutate(OS.Time = as.numeric(as.character(OS.Time))/365)
# alive_patient 
alive_patient <- phen.cln %>%
  dplyr::filter(vital_status == "Alive") %>%
  dplyr::select(-days_to_death) %>%
  reshape::rename(c(bcr_patient_barcode = "Barcode",
           gender = "Gender",
           vital_status = "OS",
           days_to_last_followup = "OS.Time",
           race = "Race",
           person_neoplasm_cancer_status = "cancer_status",
           age_at_initial_pathologic_diagnosis = "Age",
           neoplasm_histologic_grade = "Grade",
           pathologic_stage = "Stage",
           stage_event_tnm_categories = "TNM")) %>%
  mutate(OS = ifelse(OS == "Dead", 1, 0)) %>%
  mutate(OS.Time = as.numeric(as.character(OS.Time))/365)
# combine data 
survival_data <- rbind(dead_patient, alive_patient) %>%
  mutate(Barcode = gsub("-", "_", Barcode))

filter expression matrix

First, the biological features were removed if having zero value in more than 20% of patients.
The samples were removed if missing across more than 20% features.
Then we used impute function from R impute package, to fill out the missing values.
Lastly, we removed input features with zero values across all samples.

get_profile <- function(profile    = prof,
                        metadata   = survival_data,
                        occurrence = 0.2,
                        present    = 0.2){
  # profile    = prof
  # metadata   = survival_data
  # occurrence = 0.2
  # present    = 0.2
  colnames(profile) <- c(colnames(profile)[1:3], gsub("-", "_", colnames(profile)[4:ncol(profile)]))
  filter_fill_fun <- function(tag){
    # tag = data_type[1]
    dat <- profile %>% filter(Platform == tag) %>%
                dplyr::select(-c("Platform", "Description"))
    # feature occurrence
    feature_occ <- apply(dat[, -1], 1, function(x){sum(x[!is.na(x)] != 0)/length(x)}) %>% 
      data.frame() %>% 
      setNames("feature_occurrence") %>%
      mutate(GeneSymbol = dat$GeneSymbol)
    remain_feature <- feature_occ %>% filter(feature_occurrence > occurrence)
    # sample occurrence 
    sample_occ <- apply(dat[, -1], 2, function(x){sum(x[!is.na(x)] != 0)/length(x)}) %>% 
      data.frame() %>% 
      setNames("sample_occurrence") %>%
      rownames_to_column("sample")
    remain_sample <- sample_occ %>% filter(sample_occurrence > present)
    # remove duplicate features
    dat.cln <- dat %>% filter(GeneSymbol%in%as.character(remain_feature$GeneSymbol)) %>%
      dplyr::select(c("GeneSymbol", as.character(remain_sample$sample))) 
    dat.cln$median <- apply(dat.cln[, -1], 1, median)
    dat.cln <- with(dat.cln, dat.cln[order(GeneSymbol, median, decreasing = T), ])
    dat.symbol <- dat.cln[!duplicated(dat.cln$GeneSymbol), ] %>%
      dplyr::select(-median) %>% 
      column_to_rownames("GeneSymbol")
    # impute.knn 
    set.seed(12345)
    saved.state <- .Random.seed
    dat.imputed <- impute.knn(as.matrix(dat.symbol))
    return(dat.imputed)    
  }
  data_type <- unique(profile$Platform)
  #print(data_type)
  copyNumber <- filter_fill_fun(data_type[1])
  geneExp <- filter_fill_fun(data_type[2])
  methylation <- filter_fill_fun(data_type[3])
  protein_RPPA <- filter_fill_fun(data_type[4])
  copyNumber_prf <- copyNumber$data
  geneExp_prf <- geneExp$data
  methylation_prf <- methylation$data
  protein_RPPA_prf <- protein_RPPA$data
  common_sampleid <- intersect(metadata$Barcode, 
                        intersect(colnames(copyNumber_prf), 
                          intersect(colnames(geneExp_prf), 
                            intersect(colnames(methylation_prf), colnames(protein_RPPA_prf))
                          )
                        )
                      )
  metadata.cln <- metadata %>% filter(Barcode %in% common_sampleid)
  copyNumber_prf.cln <- copyNumber_prf[, colnames(copyNumber_prf)%in%common_sampleid]
  geneExp_prf.cln <- geneExp_prf[, colnames(geneExp_prf)%in%common_sampleid]
  methylation_prf.cln <- methylation_prf[, colnames(methylation_prf)%in%common_sampleid]
  protein_RPPA_prf.cln <- protein_RPPA_prf[, colnames(protein_RPPA_prf)%in%common_sampleid]
  res <- list(meta = metadata.cln, 
              CN = copyNumber_prf.cln,
              GE = geneExp_prf.cln,
              ML = methylation_prf.cln,
              PR = protein_RPPA_prf.cln)
  return(res)
}
data_trim_res <- get_profile(profile = prof,
                             metadata = survival_data)

output

dir_prf <- "../../Result/profile"
if(!dir.exists(dir_prf)){
  dir.create(dir_prf)
}
# copyNumber
copyNumber.path <- paste(dir_prf, "copyNumber_filter.tsv", sep = "/")
write.table(data_trim_res$CN, file = copyNumber.path, quote = F, sep = "\t")
# geneExp
geneExp.path <- paste(dir_prf, "geneExp_filter.tsv", sep = "/")
write.table(data_trim_res$GE, file = geneExp.path, quote = F, sep = "\t")
# methylation
methylation.path <- paste(dir_prf, "methylation_filter.tsv", sep = "/")
write.table(data_trim_res$ML, file = methylation.path, quote = F, sep = "\t")
# protein_RPPA
protein_RPPA.path <- paste(dir_prf, "protein_RPPA_filter.tsv", sep = "/")
write.table(data_trim_res$PR, file = protein_RPPA.path, quote = F, sep = "\t")
dir_phe <- "../../Result/phenotype"
if(!dir.exists(dir_phe)){
  dir.create(dir_phe)
}
# total 
all_survival.path <- paste(dir_phe, "all_survival_data.tsv", sep = "/")
write.table(survival_data, file = all_survival.path, quote = F, sep = "\t", row.names = F)
# total 
common_survival.path <- paste(dir_phe, "common_survival_data.tsv", sep = "/")
write.table(data_trim_res$meta, file = common_survival.path, quote = F, sep = "\t", row.names = F)

version

sessionInfo()

R version 3.6.3 (2020-02-29)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /data/share/anaconda3/lib/libopenblasp-r0.3.12.so
locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     
other attached packages:
[1] impute_1.60.0     ggpubr_0.4.0      ggplot2_3.3.3     data.table_1.13.6 tibble_3.0.4      dplyr_1.0.2      
loaded via a namespace (and not attached):
 [1] Rcpp_1.0.5       lattice_0.20-41  tidyr_1.1.2      rprojroot_2.0.2  assertthat_0.2.1 digest_0.6.27    R6_2.5.0        
 [8] cellranger_1.1.0 plyr_1.8.6       backports_1.2.1  evaluate_0.14    pillar_1.4.7     rlang_0.4.10     curl_4.3        
[15] readxl_1.3.1     rstudioapi_0.13  car_3.0-10       vegan_2.5-7      Matrix_1.3-0     rmarkdown_2.6    desc_1.2.0      
[22] splines_3.6.3    stringr_1.4.0    foreign_0.8-76   munsell_0.5.0    broom_0.7.3      compiler_3.6.3   xfun_0.19       
[29] pkgconfig_2.0.3  mgcv_1.8-33      htmltools_0.5.0  tidyselect_1.1.0 rio_0.5.16       reshape_0.8.8    fansi_0.4.1     
[36] permute_0.9-5    crayon_1.3.4     withr_2.3.0      MASS_7.3-53      grid_3.6.3       nlme_3.1-150     gtable_0.3.0    
[43] lifecycle_0.2.0  magrittr_2.0.1   scales_1.1.1     zip_2.1.1        cli_2.2.0        stringi_1.5.3    carData_3.0-4   
[50] ggsignif_0.6.0   testthat_3.0.1   ellipsis_0.3.1   generics_0.1.0   vctrs_0.3.6      cowplot_1.1.1    openxlsx_4.2.3  
[57] tools_3.6.3      forcats_0.5.0    glue_1.4.2       purrr_0.3.4      hms_0.5.3        abind_1.4-5      pkgload_1.1.0   
[64] parallel_3.6.3   yaml_2.2.1       colorspace_2.0-0 cluster_2.1.0    rstatix_0.6.0    knitr_1.30       haven_2.3.1

load R packages

load phenotype

curation

choose index of the phenotype for survival analysis

filter expression matrix

output

version

Reference