技能树作业打卡
    题目数据来源:ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human/CCDS.current.txt
    方法:python
    解题思路:从文件中取出每条染色体上的外显子区间(起点-止点),通过range(起点,终点)的方法,得到外显子中每一个碱基的坐标,以外显子为单位,将包含碱基坐标的list转化为python中的set格式(快,默认去重),将同一个染色体中的set取并集,然后相加,over。
    step1 读入数据,整理数据

    1. ## 加载包
    2. import pandas as pd
    3. import numpy as np
    4. import re
    5. ## 数据地址
    6. file = '/home/my_python/weekly_task/CCDS.current.txt'
    7. ## 通过pandas包读入数据
    8. data = pd.read_csv(file,sep = '\t',usecols=[0,9]) ## 0列:染色体编号 1列:外显子区间
    9. data.iloc[:,0].unique()
    10. ## array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
    11. ##'13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X',
    12. ## 'Y'], dtype=object)
    13. ## 整理外显子区间,按照染色体,扔进一个大的list里
    14. genom_exon = []
    15. for i in data.iloc[:,0].unique():
    16. # print(i)
    17. data_temp_1 = data.loc[data.iloc[:,0] == i].dropna(axis=0,how='any')
    18. data_temp_2 = data_temp_1.iloc[:,1].apply(lambda x: x[1:-1]).apply(lambda x: x.split(', '))
    19. data_temp_3 = data_temp_2.apply(lambda x: pd.Series(x))
    20. data_temp_4 = np.array(data_temp_3)
    21. data_temp_5 = data_temp_4.flatten()
    22. data_i = np.unique(data_temp_5.astype(str))
    23. data_i = [exon for exon in data_i if exon != ''][:-1]
    24. genom_exon.append(data_i)
    25. len(genom_exon) ## 24,说明对了

    step2 把外显子区间转换为碱基坐标的set

    1. genome_base = [] ## 搞一个大list,hold住everything
    2. for i in range(len(genom_exon)): ##遍历两次,先遍历每一条染色体
    3. #print(i)
    4. #i = 14
    5. base_pool_i = []
    6. data_chrom = genom_exon[i]
    7. for i in range(len(data_chrom)): ## 再遍历每一个外显子
    8. #print(i)
    9. pattern = r'^d'
    10. if re.match(pattern,data_chrom[i]):
    11. pass
    12. else:
    13. exon_i = data_chrom[i].split('-')
    14. exon_temp = [int(x) for x in exon_i]
    15. if len(exon_temp) < 2:
    16. base_i = set(exon_temp)
    17. base_pool_i.append(base_i)
    18. else:
    19. base_temp = np.arange(exon_temp[0],exon_temp[1])
    20. base_i = set(base_temp)
    21. base_pool_i.append(base_i) ## append set 运算非常快
    22. genome_base.append(base_pool_i)
    23. len(genome_base) ## 24,大概率也对了
    24. for i in range(24):
    25. print(len(genome_base[i])) ## 查看一下每条染色体中,外显子个数

    step3 把每条染色体中的set取并集

    1. chorm = set()
    2. def job(n):
    3. base_pool = genome_base[n]
    4. global chrom ## 没有声明global,翻车
    5. chrom = set()
    6. for i in range(len(base_pool)):
    7. base_i = base_pool[i]
    8. chrom = chrom | base_i
    9. #print(len(chrom))
    10. return len(chrom)

    计算量hold不住了,用多核算

    1. from concurrent.futures import ProcessPoolExecutor
    2. import time
    3. ch = list(range(0,len(genome_base))) ## 生成用于map job的list
    4. start = time.time()
    5. with ProcessPoolExecutor(max_workers=40) as pool: ## 多摇几个大兄弟帮忙算
    6. length = pool.map(job,ch)
    7. results = list(zip(ch,length))
    8. end = time.time()
    9. ## 用时
    10. (end -start)/60
    11. # 25.634980487823487
    12. results
    13. #[(0, 2992209),
    14. (1, 1711341),
    15. (2, 1507702),
    16. (3, 1010569),
    17. (4, 1208083),
    18. (5, 1438364),
    19. (6, 1113033),
    20. (7, 827830),
    21. (8, 1102460),
    22. (9, 1088788),
    23. (10, 1526411),
    24. (11, 1366565),
    25. (12, 498138),
    26. (13, 859166),
    27. (14, 831004),
    28. (15, 1095975),
    29. (16, 1535509),
    30. (17, 400720),
    31. (18, 1617269),
    32. (19, 740145),
    33. (20, 289957),
    34. (21, 531408),
    35. (22, 1112644),
    36. (23, 76264)]

    笔记本的8个核,估计会算很久(要爱护自己的电脑)

    1. ##最后,加一加
    2. total_len = 0
    3. for ch,length in results:
    4. total_len += length
    5. print(total_len)
    6. ## 26481554 比其他人短了很多,overlap这么多吗?