技能树作业打卡
题目数据来源:ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human/CCDS.current.txt
方法:python
解题思路:从文件中取出每条染色体上的外显子区间(起点-止点),通过range(起点,终点)的方法,得到外显子中每一个碱基的坐标,以外显子为单位,将包含碱基坐标的list转化为python中的set格式(快,默认去重),将同一个染色体中的set取并集,然后相加,over。
step1 读入数据,整理数据
## 加载包import pandas as pdimport numpy as npimport re## 数据地址file = '/home/my_python/weekly_task/CCDS.current.txt'## 通过pandas包读入数据data = pd.read_csv(file,sep = '\t',usecols=[0,9]) ## 0列:染色体编号 1列:外显子区间data.iloc[:,0].unique()## array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',##'13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X',## 'Y'], dtype=object)## 整理外显子区间,按照染色体,扔进一个大的list里genom_exon = []for i in data.iloc[:,0].unique():# print(i)data_temp_1 = data.loc[data.iloc[:,0] == i].dropna(axis=0,how='any')data_temp_2 = data_temp_1.iloc[:,1].apply(lambda x: x[1:-1]).apply(lambda x: x.split(', '))data_temp_3 = data_temp_2.apply(lambda x: pd.Series(x))data_temp_4 = np.array(data_temp_3)data_temp_5 = data_temp_4.flatten()data_i = np.unique(data_temp_5.astype(str))data_i = [exon for exon in data_i if exon != ''][:-1]genom_exon.append(data_i)len(genom_exon) ## 24,说明对了
step2 把外显子区间转换为碱基坐标的set
genome_base = [] ## 搞一个大list,hold住everythingfor i in range(len(genom_exon)): ##遍历两次,先遍历每一条染色体#print(i)#i = 14base_pool_i = []data_chrom = genom_exon[i]for i in range(len(data_chrom)): ## 再遍历每一个外显子#print(i)pattern = r'^d'if re.match(pattern,data_chrom[i]):passelse:exon_i = data_chrom[i].split('-')exon_temp = [int(x) for x in exon_i]if len(exon_temp) < 2:base_i = set(exon_temp)base_pool_i.append(base_i)else:base_temp = np.arange(exon_temp[0],exon_temp[1])base_i = set(base_temp)base_pool_i.append(base_i) ## append set 运算非常快genome_base.append(base_pool_i)len(genome_base) ## 24,大概率也对了for i in range(24):print(len(genome_base[i])) ## 查看一下每条染色体中,外显子个数
step3 把每条染色体中的set取并集
chorm = set()def job(n):base_pool = genome_base[n]global chrom ## 没有声明global,翻车chrom = set()for i in range(len(base_pool)):base_i = base_pool[i]chrom = chrom | base_i#print(len(chrom))return len(chrom)
计算量hold不住了,用多核算
from concurrent.futures import ProcessPoolExecutorimport timech = list(range(0,len(genome_base))) ## 生成用于map job的liststart = time.time()with ProcessPoolExecutor(max_workers=40) as pool: ## 多摇几个大兄弟帮忙算length = pool.map(job,ch)results = list(zip(ch,length))end = time.time()## 用时(end -start)/60# 25.634980487823487results#[(0, 2992209),(1, 1711341),(2, 1507702),(3, 1010569),(4, 1208083),(5, 1438364),(6, 1113033),(7, 827830),(8, 1102460),(9, 1088788),(10, 1526411),(11, 1366565),(12, 498138),(13, 859166),(14, 831004),(15, 1095975),(16, 1535509),(17, 400720),(18, 1617269),(19, 740145),(20, 289957),(21, 531408),(22, 1112644),(23, 76264)]
笔记本的8个核,估计会算很久(要爱护自己的电脑)
##最后,加一加total_len = 0for ch,length in results:total_len += lengthprint(total_len)## 26481554 比其他人短了很多,overlap这么多吗?
