数据来源:http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz 下载.gz数据后解压
思路:用re模块中的正则匹配和find_all
## 目标:输出每条染色提的长度,gc含量,每一个碱基的个数import timeimport osimport reDir = r'/Users/Python/training/weekly/chromFa/'def job(file):with open(Dir+file) as ch:global A_content,G_content,C_content,T_content,N_content,resultresult = {}A_content = 0G_content = 0C_content = 0T_content = 0N_content = 0find_A = re.compile('A',re.I) ## 忽略大小写find_G = re.compile('G',re.I)find_C = re.compile('C', re.I)find_T = re.compile('T', re.I)find_N = re.compile('N', re.I)for line in ch:if line.startswith(">"):pass# print(line)# print(len(line))A = len(re.findall(find_A,line)) ## 找到每一行A碱基的个数,逐个相加A_content += AG = len(re.findall(find_G, line))G_content += GC = len(re.findall(find_C, line))C_content += CT = len(re.findall(find_T, line))T_content += TN = len(re.findall(find_N, line))N_content += Nresult['A_content'] = A_contentresult['G_content'] = G_contentresult['C_content'] = C_contentresult['T_content'] = T_contentresult['N_content'] = N_contentreturn (result)## 用ch1试一下,返回一个dic,key是碱基种类,value是碱基个数,用时约70s# ch1 = job('chr1.fa')# print(ch1)# {'A_content': 65570891, 'G_content': 47016562, 'C_content': 47024413, 'T_content': 65668756, 'N_content': 23970000}## 准备所有要计算的染色体files = []for i in (list(range(1,23))+['M','X','Y']):file = 'chr' +str(i) + '.fa'files.append(file)## 4个核,约2min多from concurrent.futures import ProcessPoolExecutorwith ProcessPoolExecutor(max_workers=4) as pool:content = pool.map(job,files)results = list(zip(files,content))print(results)## 计算res = []for i in range(len(results)):chrom = results[i][0].split(".")[0]dic = results[i][1]dic['length'] = sum(dic.values())dic['GC_percent'] = round((dic['G_content'] +dic['C_content'])/(dic['length']-dic['N_content']),2)dic['N_percent'] = round(dic['N_content']/dic['length'],2)dic['chromosome'] =chromres.append(dic)#print(res)## 因为是dict,很容易放入pandas的dataframe中import pandas as pddf = pd.DataFrame.from_dict(res)

然后可视化一下
import matplotlib.pyplot as pltn=25X = np.arange(n)Y1 = df['GC_percent']*100Y2 = df['length']/1000000name = df['chromosome']name = [x[3::] for x in name]print(name)plt.bar(X,-Y1,facecolor='#9999ff',edgecolor='white')plt.bar(X,Y2,facecolor='#ff9999',edgecolor='white')Y1 = [int(x) for x in Y1]Y2 = [int(x) for x in Y2]print(Y1)print(Y2)for x,y in zip(X,Y2):plt.text(x+0.1,y+0.05,'%d'%y,ha='center',va='bottom',fontsize=8)for x, y in zip(X, Y1):plt.text(x + 0.1, -y - 0.08, '%d' %y, ha='center', va='top',fontsize=8)for x,y in zip(X,name):plt.text(x+0.1,0,'%s'%y,ha='center',va='bottom',fontsize=10)plt.xlim(-.5,n)plt.ylim(-100,300)plt.title('choromsome length and GC percent')plt.yticks([-100,-50,0,100,200,300],[r'100%',r'50%',r'0',r'100',r'200',r'300'])plt.ylabel(' percent & length(M)')plt.show()

还有更快的方法,再试一下~
