load R packages

  1. library(dplyr)
  2. library(tibble)
  3. library(data.table)
  4. library(impute)

load phenotype

  1. phen <- read.csv("../../Study/Phenotype/clinical__information.csv")
  2. prof <- fread("../../Study/RawData/samples_expression.txt")

curation

choose index of the phenotype for survival analysis

  1. phen.cln <- phen %>%
  2. mutate(stage_event_tnm_categories = paste0(pathologic_T, pathologic_N, pathologic_M)) %>%
  3. dplyr::select(bcr_patient_barcode, gender, vital_status, days_to_death,
  4. days_to_last_followup, race, person_neoplasm_cancer_status,
  5. age_at_initial_pathologic_diagnosis, laterality,
  6. neoplasm_histologic_grade, pathologic_stage, stage_event_tnm_categories) %>%
  7. distinct(bcr_patient_barcode, .keep_all = TRUE)
  8. # dead_patient
  9. dead_patient <- phen.cln %>%
  10. dplyr::filter(vital_status == "Dead") %>%
  11. dplyr::select(-days_to_last_followup) %>%
  12. reshape::rename(c(bcr_patient_barcode = "Barcode",
  13. gender = "Gender",
  14. vital_status = "OS",
  15. days_to_death = "OS.Time",
  16. race = "Race",
  17. person_neoplasm_cancer_status = "cancer_status",
  18. age_at_initial_pathologic_diagnosis = "Age",
  19. neoplasm_histologic_grade = "Grade",
  20. pathologic_stage = "Stage",
  21. stage_event_tnm_categories = "TNM")) %>%
  22. mutate(OS = ifelse(OS == "Dead", 1, 0)) %>%
  23. mutate(OS.Time = as.numeric(as.character(OS.Time))/365)
  24. # alive_patient
  25. alive_patient <- phen.cln %>%
  26. dplyr::filter(vital_status == "Alive") %>%
  27. dplyr::select(-days_to_death) %>%
  28. reshape::rename(c(bcr_patient_barcode = "Barcode",
  29. gender = "Gender",
  30. vital_status = "OS",
  31. days_to_last_followup = "OS.Time",
  32. race = "Race",
  33. person_neoplasm_cancer_status = "cancer_status",
  34. age_at_initial_pathologic_diagnosis = "Age",
  35. neoplasm_histologic_grade = "Grade",
  36. pathologic_stage = "Stage",
  37. stage_event_tnm_categories = "TNM")) %>%
  38. mutate(OS = ifelse(OS == "Dead", 1, 0)) %>%
  39. mutate(OS.Time = as.numeric(as.character(OS.Time))/365)
  40. # combine data
  41. survival_data <- rbind(dead_patient, alive_patient) %>%
  42. mutate(Barcode = gsub("-", "_", Barcode))

filter expression matrix

  • First, the biological features were removed if having zero value in more than 20% of patients.
  • The samples were removed if missing across more than 20% features.
  • Then we used impute function from R impute package, to fill out the missing values.
  • Lastly, we removed input features with zero values across all samples.
  1. get_profile <- function(profile = prof,
  2. metadata = survival_data,
  3. occurrence = 0.2,
  4. present = 0.2){
  5. # profile = prof
  6. # metadata = survival_data
  7. # occurrence = 0.2
  8. # present = 0.2
  9. colnames(profile) <- c(colnames(profile)[1:3], gsub("-", "_", colnames(profile)[4:ncol(profile)]))
  10. filter_fill_fun <- function(tag){
  11. # tag = data_type[1]
  12. dat <- profile %>% filter(Platform == tag) %>%
  13. dplyr::select(-c("Platform", "Description"))
  14. # feature occurrence
  15. feature_occ <- apply(dat[, -1], 1, function(x){sum(x[!is.na(x)] != 0)/length(x)}) %>%
  16. data.frame() %>%
  17. setNames("feature_occurrence") %>%
  18. mutate(GeneSymbol = dat$GeneSymbol)
  19. remain_feature <- feature_occ %>% filter(feature_occurrence > occurrence)
  20. # sample occurrence
  21. sample_occ <- apply(dat[, -1], 2, function(x){sum(x[!is.na(x)] != 0)/length(x)}) %>%
  22. data.frame() %>%
  23. setNames("sample_occurrence") %>%
  24. rownames_to_column("sample")
  25. remain_sample <- sample_occ %>% filter(sample_occurrence > present)
  26. # remove duplicate features
  27. dat.cln <- dat %>% filter(GeneSymbol%in%as.character(remain_feature$GeneSymbol)) %>%
  28. dplyr::select(c("GeneSymbol", as.character(remain_sample$sample)))
  29. dat.cln$median <- apply(dat.cln[, -1], 1, median)
  30. dat.cln <- with(dat.cln, dat.cln[order(GeneSymbol, median, decreasing = T), ])
  31. dat.symbol <- dat.cln[!duplicated(dat.cln$GeneSymbol), ] %>%
  32. dplyr::select(-median) %>%
  33. column_to_rownames("GeneSymbol")
  34. # impute.knn
  35. set.seed(12345)
  36. saved.state <- .Random.seed
  37. dat.imputed <- impute.knn(as.matrix(dat.symbol))
  38. return(dat.imputed)
  39. }
  40. data_type <- unique(profile$Platform)
  41. #print(data_type)
  42. copyNumber <- filter_fill_fun(data_type[1])
  43. geneExp <- filter_fill_fun(data_type[2])
  44. methylation <- filter_fill_fun(data_type[3])
  45. protein_RPPA <- filter_fill_fun(data_type[4])
  46. copyNumber_prf <- copyNumber$data
  47. geneExp_prf <- geneExp$data
  48. methylation_prf <- methylation$data
  49. protein_RPPA_prf <- protein_RPPA$data
  50. common_sampleid <- intersect(metadata$Barcode,
  51. intersect(colnames(copyNumber_prf),
  52. intersect(colnames(geneExp_prf),
  53. intersect(colnames(methylation_prf), colnames(protein_RPPA_prf))
  54. )
  55. )
  56. )
  57. metadata.cln <- metadata %>% filter(Barcode %in% common_sampleid)
  58. copyNumber_prf.cln <- copyNumber_prf[, colnames(copyNumber_prf)%in%common_sampleid]
  59. geneExp_prf.cln <- geneExp_prf[, colnames(geneExp_prf)%in%common_sampleid]
  60. methylation_prf.cln <- methylation_prf[, colnames(methylation_prf)%in%common_sampleid]
  61. protein_RPPA_prf.cln <- protein_RPPA_prf[, colnames(protein_RPPA_prf)%in%common_sampleid]
  62. res <- list(meta = metadata.cln,
  63. CN = copyNumber_prf.cln,
  64. GE = geneExp_prf.cln,
  65. ML = methylation_prf.cln,
  66. PR = protein_RPPA_prf.cln)
  67. return(res)
  68. }
  69. data_trim_res <- get_profile(profile = prof,
  70. metadata = survival_data)

output

  1. dir_prf <- "../../Result/profile"
  2. if(!dir.exists(dir_prf)){
  3. dir.create(dir_prf)
  4. }
  5. # copyNumber
  6. copyNumber.path <- paste(dir_prf, "copyNumber_filter.tsv", sep = "/")
  7. write.table(data_trim_res$CN, file = copyNumber.path, quote = F, sep = "\t")
  8. # geneExp
  9. geneExp.path <- paste(dir_prf, "geneExp_filter.tsv", sep = "/")
  10. write.table(data_trim_res$GE, file = geneExp.path, quote = F, sep = "\t")
  11. # methylation
  12. methylation.path <- paste(dir_prf, "methylation_filter.tsv", sep = "/")
  13. write.table(data_trim_res$ML, file = methylation.path, quote = F, sep = "\t")
  14. # protein_RPPA
  15. protein_RPPA.path <- paste(dir_prf, "protein_RPPA_filter.tsv", sep = "/")
  16. write.table(data_trim_res$PR, file = protein_RPPA.path, quote = F, sep = "\t")
  17. dir_phe <- "../../Result/phenotype"
  18. if(!dir.exists(dir_phe)){
  19. dir.create(dir_phe)
  20. }
  21. # total
  22. all_survival.path <- paste(dir_phe, "all_survival_data.tsv", sep = "/")
  23. write.table(survival_data, file = all_survival.path, quote = F, sep = "\t", row.names = F)
  24. # total
  25. common_survival.path <- paste(dir_phe, "common_survival_data.tsv", sep = "/")
  26. write.table(data_trim_res$meta, file = common_survival.path, quote = F, sep = "\t", row.names = F)

version

  1. sessionInfo()
  1. R version 3.6.3 (2020-02-29)
  2. Platform: x86_64-conda-linux-gnu (64-bit)
  3. Running under: CentOS Linux 7 (Core)
  4. Matrix products: default
  5. BLAS/LAPACK: /data/share/anaconda3/lib/libopenblasp-r0.3.12.so
  6. locale:
  7. [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
  8. [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
  9. [9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
  10. attached base packages:
  11. [1] stats graphics grDevices utils datasets methods base
  12. other attached packages:
  13. [1] impute_1.60.0 ggpubr_0.4.0 ggplot2_3.3.3 data.table_1.13.6 tibble_3.0.4 dplyr_1.0.2
  14. loaded via a namespace (and not attached):
  15. [1] Rcpp_1.0.5 lattice_0.20-41 tidyr_1.1.2 rprojroot_2.0.2 assertthat_0.2.1 digest_0.6.27 R6_2.5.0
  16. [8] cellranger_1.1.0 plyr_1.8.6 backports_1.2.1 evaluate_0.14 pillar_1.4.7 rlang_0.4.10 curl_4.3
  17. [15] readxl_1.3.1 rstudioapi_0.13 car_3.0-10 vegan_2.5-7 Matrix_1.3-0 rmarkdown_2.6 desc_1.2.0
  18. [22] splines_3.6.3 stringr_1.4.0 foreign_0.8-76 munsell_0.5.0 broom_0.7.3 compiler_3.6.3 xfun_0.19
  19. [29] pkgconfig_2.0.3 mgcv_1.8-33 htmltools_0.5.0 tidyselect_1.1.0 rio_0.5.16 reshape_0.8.8 fansi_0.4.1
  20. [36] permute_0.9-5 crayon_1.3.4 withr_2.3.0 MASS_7.3-53 grid_3.6.3 nlme_3.1-150 gtable_0.3.0
  21. [43] lifecycle_0.2.0 magrittr_2.0.1 scales_1.1.1 zip_2.1.1 cli_2.2.0 stringi_1.5.3 carData_3.0-4
  22. [50] ggsignif_0.6.0 testthat_3.0.1 ellipsis_0.3.1 generics_0.1.0 vctrs_0.3.6 cowplot_1.1.1 openxlsx_4.2.3
  23. [57] tools_3.6.3 forcats_0.5.0 glue_1.4.2 purrr_0.3.4 hms_0.5.3 abind_1.4-5 pkgload_1.1.0
  24. [64] parallel_3.6.3 yaml_2.2.1 colorspace_2.0-0 cluster_2.1.0 rstatix_0.6.0 knitr_1.30 haven_2.3.1

Reference

  1. TCGA数据下载整理问题
  2. fill out the missing values