数据来源http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz 下载.gz数据后解压
    思路:用re模块中的正则匹配和find_all

    1. ## 目标:输出每条染色提的长度,gc含量,每一个碱基的个数
    2. import time
    3. import os
    4. import re
    5. Dir = r'/Users/Python/training/weekly/chromFa/'
    6. def job(file):
    7. with open(Dir+file) as ch:
    8. global A_content,G_content,C_content,T_content,N_content,result
    9. result = {}
    10. A_content = 0
    11. G_content = 0
    12. C_content = 0
    13. T_content = 0
    14. N_content = 0
    15. find_A = re.compile('A',re.I) ## 忽略大小写
    16. find_G = re.compile('G',re.I)
    17. find_C = re.compile('C', re.I)
    18. find_T = re.compile('T', re.I)
    19. find_N = re.compile('N', re.I)
    20. for line in ch:
    21. if line.startswith(">"):
    22. pass
    23. # print(line)
    24. # print(len(line))
    25. A = len(re.findall(find_A,line)) ## 找到每一行A碱基的个数,逐个相加
    26. A_content += A
    27. G = len(re.findall(find_G, line))
    28. G_content += G
    29. C = len(re.findall(find_C, line))
    30. C_content += C
    31. T = len(re.findall(find_T, line))
    32. T_content += T
    33. N = len(re.findall(find_N, line))
    34. N_content += N
    35. result['A_content'] = A_content
    36. result['G_content'] = G_content
    37. result['C_content'] = C_content
    38. result['T_content'] = T_content
    39. result['N_content'] = N_content
    40. return (result)
    41. ## 用ch1试一下,返回一个dic,key是碱基种类,value是碱基个数,用时约70s
    42. # ch1 = job('chr1.fa')
    43. # print(ch1)
    44. # {'A_content': 65570891, 'G_content': 47016562, 'C_content': 47024413, 'T_content': 65668756, 'N_content': 23970000}
    45. ## 准备所有要计算的染色体
    46. files = []
    47. for i in (list(range(1,23))+['M','X','Y']):
    48. file = 'chr' +str(i) + '.fa'
    49. files.append(file)
    50. ## 4个核,约2min多
    51. from concurrent.futures import ProcessPoolExecutor
    52. with ProcessPoolExecutor(max_workers=4) as pool:
    53. content = pool.map(job,files)
    54. results = list(zip(files,content))
    55. print(results)
    56. ## 计算
    57. res = []
    58. for i in range(len(results)):
    59. chrom = results[i][0].split(".")[0]
    60. dic = results[i][1]
    61. dic['length'] = sum(dic.values())
    62. dic['GC_percent'] = round((dic['G_content'] +dic['C_content'])/(dic['length']-dic['N_content']),2)
    63. dic['N_percent'] = round(dic['N_content']/dic['length'],2)
    64. dic['chromosome'] =chrom
    65. res.append(dic)
    66. #print(res)
    67. ## 因为是dict,很容易放入pandas的dataframe中
    68. import pandas as pd
    69. df = pd.DataFrame.from_dict(res)

    Screen Shot 2021-11-28 at 11.16.56 PM.png
    然后可视化一下

    1. import matplotlib.pyplot as plt
    2. n=25
    3. X = np.arange(n)
    4. Y1 = df['GC_percent']*100
    5. Y2 = df['length']/1000000
    6. name = df['chromosome']
    7. name = [x[3::] for x in name]
    8. print(name)
    9. plt.bar(X,-Y1,facecolor='#9999ff',edgecolor='white')
    10. plt.bar(X,Y2,facecolor='#ff9999',edgecolor='white')
    11. Y1 = [int(x) for x in Y1]
    12. Y2 = [int(x) for x in Y2]
    13. print(Y1)
    14. print(Y2)
    15. for x,y in zip(X,Y2):
    16. plt.text(x+0.1,y+0.05,'%d'%y,ha='center',va='bottom',fontsize=8)
    17. for x, y in zip(X, Y1):
    18. plt.text(x + 0.1, -y - 0.08, '%d' %y, ha='center', va='top',fontsize=8)
    19. for x,y in zip(X,name):
    20. plt.text(x+0.1,0,'%s'%y,ha='center',va='bottom',fontsize=10)
    21. plt.xlim(-.5,n)
    22. plt.ylim(-100,300)
    23. plt.title('choromsome length and GC percent')
    24. plt.yticks([-100,-50,0,100,200,300],
    25. [r'100%',r'50%',r'0',r'100',r'200',r'300'])
    26. plt.ylabel(' percent & length(M)')
    27. plt.show()

    Screen Shot 2021-11-28 at 11.18.45 PM.png
    还有更快的方法,再试一下~