技能树作业打卡
题目数据来源:ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human/CCDS.current.txt
方法:python
解题思路:从文件中取出每条染色体上的外显子区间(起点-止点),通过range(起点,终点)的方法,得到外显子中每一个碱基的坐标,以外显子为单位,将包含碱基坐标的list转化为python中的set格式(快,默认去重),将同一个染色体中的set取并集,然后相加,over。
step1 读入数据,整理数据
## 加载包
import pandas as pd
import numpy as np
import re
## 数据地址
file = '/home/my_python/weekly_task/CCDS.current.txt'
## 通过pandas包读入数据
data = pd.read_csv(file,sep = '\t',usecols=[0,9]) ## 0列:染色体编号 1列:外显子区间
data.iloc[:,0].unique()
## array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
##'13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X',
## 'Y'], dtype=object)
## 整理外显子区间,按照染色体,扔进一个大的list里
genom_exon = []
for i in data.iloc[:,0].unique():
# print(i)
data_temp_1 = data.loc[data.iloc[:,0] == i].dropna(axis=0,how='any')
data_temp_2 = data_temp_1.iloc[:,1].apply(lambda x: x[1:-1]).apply(lambda x: x.split(', '))
data_temp_3 = data_temp_2.apply(lambda x: pd.Series(x))
data_temp_4 = np.array(data_temp_3)
data_temp_5 = data_temp_4.flatten()
data_i = np.unique(data_temp_5.astype(str))
data_i = [exon for exon in data_i if exon != ''][:-1]
genom_exon.append(data_i)
len(genom_exon) ## 24,说明对了
step2 把外显子区间转换为碱基坐标的set
genome_base = [] ## 搞一个大list,hold住everything
for i in range(len(genom_exon)): ##遍历两次,先遍历每一条染色体
#print(i)
#i = 14
base_pool_i = []
data_chrom = genom_exon[i]
for i in range(len(data_chrom)): ## 再遍历每一个外显子
#print(i)
pattern = r'^d'
if re.match(pattern,data_chrom[i]):
pass
else:
exon_i = data_chrom[i].split('-')
exon_temp = [int(x) for x in exon_i]
if len(exon_temp) < 2:
base_i = set(exon_temp)
base_pool_i.append(base_i)
else:
base_temp = np.arange(exon_temp[0],exon_temp[1])
base_i = set(base_temp)
base_pool_i.append(base_i) ## append set 运算非常快
genome_base.append(base_pool_i)
len(genome_base) ## 24,大概率也对了
for i in range(24):
print(len(genome_base[i])) ## 查看一下每条染色体中,外显子个数
step3 把每条染色体中的set取并集
chorm = set()
def job(n):
base_pool = genome_base[n]
global chrom ## 没有声明global,翻车
chrom = set()
for i in range(len(base_pool)):
base_i = base_pool[i]
chrom = chrom | base_i
#print(len(chrom))
return len(chrom)
计算量hold不住了,用多核算
from concurrent.futures import ProcessPoolExecutor
import time
ch = list(range(0,len(genome_base))) ## 生成用于map job的list
start = time.time()
with ProcessPoolExecutor(max_workers=40) as pool: ## 多摇几个大兄弟帮忙算
length = pool.map(job,ch)
results = list(zip(ch,length))
end = time.time()
## 用时
(end -start)/60
# 25.634980487823487
results
#[(0, 2992209),
(1, 1711341),
(2, 1507702),
(3, 1010569),
(4, 1208083),
(5, 1438364),
(6, 1113033),
(7, 827830),
(8, 1102460),
(9, 1088788),
(10, 1526411),
(11, 1366565),
(12, 498138),
(13, 859166),
(14, 831004),
(15, 1095975),
(16, 1535509),
(17, 400720),
(18, 1617269),
(19, 740145),
(20, 289957),
(21, 531408),
(22, 1112644),
(23, 76264)]
笔记本的8个核,估计会算很久(要爱护自己的电脑)
##最后,加一加
total_len = 0
for ch,length in results:
total_len += length
print(total_len)
## 26481554 比其他人短了很多,overlap这么多吗?