TCGAbiolinks (三)获取全面的临床数据 – 璃墨的小站
library(TCGAbiolinks)
一、基础数据下载
1 下载GDC文件
# Build the GDC query describing the TCGA-PRAD clinical XML files.
# NOTE(review): no GDCdownload() call is visible in this script; the files are
# presumably expected to already exist in the directory that is later passed
# to GDCprepare_clinic() -- confirm.
# NOTE(review): newer TCGAbiolinks releases may select the XML files through
# `data.format` rather than `file.type` -- confirm against the installed
# version.
query <- GDCquery(
  project = "TCGA-PRAD",
  data.category = "Clinical",
  file.type = "xml"
)
2 组合数据
# Clinical sections to extract. Available choices are: "drug", "follow_up",
# "radiation", "patient", "stage_event", "new_tumor_event", "admin".
# 'drug', 'follow_up' and 'radiation' are deliberately left out here.
clinical.info <- c(
  'admin',
  'patient',
  'stage_event',
  'new_tumor_event'
)
# Helper functions are defined next.
# Drop every column of `df` whose name matches `regex`.
#   df:    a data frame.
#   regex: regular expression tested against colnames(df) with grepl().
# Returns a data frame holding the columns that did not match.
f_rm_colN <- function(df, regex) {
  keep <- !grepl(regex, colnames(df))
  # drop = FALSE keeps the result a data frame even when only one column
  # survives; without it the result silently collapses to a bare vector and
  # loses its column name.
  df[, keep, drop = FALSE]
}
# Return the positions of the entries of `NameL` selected by their frequency.
#   NameL:   vector of names.
#   reverse: FALSE (default) selects values that occur exactly once;
#            TRUE selects values that occur more than once.
# Returns an integer vector of indices into `NameL`.
f_rm_duplicated <- function(NameL, reverse = FALSE) {
  freq <- table(NameL)
  counts <- as.vector(freq)
  wanted <- names(freq)[if (reverse) counts > 1 else counts == 1]
  which(NameL %in% wanted)
}
# Assemble the clinical matrix.
# Parse each selected clinical section into its own data frame (stored under
# the section name) and drop every column whose name matches "project".
clinical <- list()
for (info in clinical.info) {
  clinical[[info]] <- GDCprepare_clinic(
    query,
    clinical.info = info,
    directory = 'Data&Results/临床数据下载/GDCdata_full clinical data/'
  )
  clinical[[info]] <- f_rm_colN(clinical[[info]], "project")
}
# The admin table additionally loses every column matching "file_uuid".
clinical$admin <- f_rm_colN(clinical$admin, "file_uuid")
# Remove duplicated rows from each section table.
clinical[clinical.info] <- lapply(clinical[clinical.info], unique)
# Merge a list of data frames into a single one by folding merge() over it.
#   lc_mergedList: list of data frames to combine, merged left to right.
#   by:            key column name(s), forwarded to merge().
#   all:           forwarded to merge(); TRUE keeps unmatched rows of both
#                  sides (full outer join).
# Returns the merged data frame.
f_merge <- function(lc_mergedList, by, all = TRUE) {
  Reduce(function(...) merge(..., by = by, all = all), lc_mergedList)
}
# Join all section tables on the patient barcode; all = TRUE keeps rows that
# are missing from some of the tables.
clinical <- f_merge(clinical, by = 'bcr_patient_barcode', all = TRUE)
二、更新数据补丁
# Fetch the clinical table for the same project via GDCquery_clinic().
cl_new <- GDCquery_clinic(project = 'TCGA-PRAD', type = 'clinical')
# Full outer join on the patient barcode. `suffixes` is appended to column
# names (other than the key) that exist in both tables, so such columns end up
# as <name>.old and <name>.new.
clinical <- merge(
  clinical,
  cl_new,
  by = 'bcr_patient_barcode',
  all = TRUE,
  suffixes = c('.old', '.new')
)
View(clinical)
三、生存分析补丁
1 补充总生存期(overall survival,OS)的 status 和 time
# Overall-survival status: 1 = dead, 0 = censored.
clinical$os_status <- ifelse(clinical[["vital_status.new"]] == 'Dead', 1, 0)
# Overall-survival time: days to death for events, otherwise days to the last
# follow-up.
clinical$os_time <- ifelse(
  clinical[["os_status"]] == 1,
  clinical[["days_to_death.new"]],
  clinical[["days_to_last_follow_up"]]
)
如果出现了死亡,则取死亡时间;如果没有死亡,则取最后随访时间
# Number of recorded deaths. na.rm = TRUE because os_status is NA for any
# patient without a vital status (possible after the all = TRUE merges), and
# a single NA would otherwise turn the whole count into NA.
sum(clinical$os_status, na.rm = TRUE)
[1] 9 (输出结果:死亡人数为 9)
2 补充无进展生存期(progression free survival,PFS)的status 和 time
PFS由 biochemical_recurrence 和 new_neoplasm_event 共同组成,优先取前者:
biochemical_recurrence 和 days_to_first_biochemical_recurrence 对应;
new_neoplasm_event 和 days_to_new_tumor_event_after_initial_treatment 对应。
1)查看数据
时间数据
# Count missing vs. present values in the two event-time columns
# (the original author reports that both contain NAs).
table(is.na(clinical[["days_to_new_tumor_event_after_initial_treatment"]]))
table(is.na(clinical[["days_to_first_biochemical_recurrence"]]))
PFS状态数据
# Per the original author: contains NAs; NA means missing, '' means no
# recurrence.
table(is.na(clinical[["new_neoplasm_event_type"]]))
# Per the original author: contains no NAs; '' means missing.
table(is.na(clinical[["biochemical_recurrence"]]))
2)将时间数据中的NA缺失值都换成空字符串 '',方便后续处理
# Replace NA with the empty string '' in both event-time columns so that later
# steps can test for "no value" with a plain comparison against ''.
clinical$days_to_new_tumor_event_after_initial_treatment <- ifelse(
  is.na(clinical[["days_to_new_tumor_event_after_initial_treatment"]]),
  '',
  clinical[["days_to_new_tumor_event_after_initial_treatment"]]
)
clinical$days_to_first_biochemical_recurrence <- ifelse(
  is.na(clinical[["days_to_first_biochemical_recurrence"]]),
  '',
  clinical[["days_to_first_biochemical_recurrence"]]
)
3)取出所有有PFS状态的数据,即排除所有缺失PFS状态的数据
# Keep a row when ANY of these holds: an event time is recorded (not ''),
# new_neoplasm_event_type is not NA, or biochemical_recurrence is not ''.
t_row <- with(
  clinical,
  days_to_new_tumor_event_after_initial_treatment != '' |
    days_to_first_biochemical_recurrence != '' |
    !is.na(new_neoplasm_event_type) |
    biochemical_recurrence != ''
)
# An NA in a logical row index would insert an all-NA row, so treat
# "cannot tell" as "do not keep".
t_row[is.na(t_row)] <- FALSE
clinical <- clinical[t_row, ]
时间数据不为 '';new_neoplasm_event_type 不为 NA;biochemical_recurrence 不为 ''。满足任一条件即可视为有生存信息,所以对这些条件取并集(逻辑"或")
4)取发生PFS的时间,PFS状态,补上缺失值的观察时间
# Event time: start from the new-tumor-event time, then let the
# biochemical-recurrence time override it wherever one is recorded.
clinical$dcf_time <- with(
  clinical,
  ifelse(
    days_to_new_tumor_event_after_initial_treatment != '',
    days_to_new_tumor_event_after_initial_treatment,
    ''
  )
)
clinical$dcf_time <- with(
  clinical,
  ifelse(
    days_to_first_biochemical_recurrence != '',
    days_to_first_biochemical_recurrence,
    dcf_time
  )
)
# Event status: 1 = a dcf event time was recorded, 0 = no tumor event.
clinical$dcf_status <- ifelse(clinical$dcf_time != '', 1, 0)
# Rows without an event fall back to the overall-survival time as their
# observation time.
clinical$dcf_time <- with(clinical, ifelse(dcf_time == '', os_time, dcf_time))
# The '' placeholders made this column character; convert it back so it can be
# used as a numeric time.
clinical$dcf_time <- as.numeric(clinical$dcf_time)
write.csv(clinical, file = 'clinical_with_os_dcf.csv')
先赋 days_to_new_tumor_event_after_initial_treatment,再用 days_to_first_biochemical_recurrence 覆盖;因此两者都有记录时,取生化复发时间(与上文"优先取前者"一致)