Problem
The GC-content of a DNA string is given by the percentage of symbols in the string that are 'C' or 'G'. For example, the GC-content of "AGCTATAG" is 37.5%. Note that the reverse complement of any DNA string has the same GC-content.

DNA strings must be labeled when they are consolidated into a database. A commonly used method of string labeling is called FASTA format. In this format, the string is introduced by a line that begins with '>', followed by some labeling information. Subsequent lines contain the string itself; the next line to begin with '>' indicates the label of the next string.

In Rosalind's implementation, a string in FASTA format will be labeled by the ID "Rosalind_xxxx", where "xxxx" denotes a four-digit code between 0000 and 9999.

Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each).

Return: The ID of the string having the highest GC-content, followed by the GC-content of that string. Rosalind allows for a default error of 0.001 in all decimal answers unless otherwise stated; please see the note on absolute error below.
input: 对于给定的fasta文件
output: 计算每条序列的GC含量
# input
>Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC
>Rosalind_0808
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGACTGGGAACCTGCGGGCAGTAGGTGGAAT
# output
Rosalind_0808
60.919540
solution - 1
def fasta_to_dict(fasta_path):
    """Read a FASTA file into a ``{record name: sequence}`` dict.

    A record starts at a line beginning with ">" and its sequence is the
    concatenation of every following line up to the next header, so
    sequences wrapped over several lines are joined back together.
    Sequences are upper-cased; blank lines are ignored, as is any sequence
    text that appears before the first header.

    :param fasta_path: path of the FASTA file to read.
    :return: dict mapping each header (without the leading ">") to its
        full sequence. If a name occurs twice, the last record wins.
    """
    chunks_by_name = {}
    current = None
    with open(fasta_path) as fd:
        # Iterate lazily instead of loading the whole file with readlines().
        for raw_line in fd:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                current = line.lstrip(">").strip()
                chunks_by_name[current] = []
            elif current is not None:
                chunks_by_name[current].append(line.upper())
    return {name: "".join(chunks) for name, chunks in chunks_by_name.items()}


def count_GC(seq):
    """Return the GC-content of ``seq`` as a percentage (0-100).

    Only upper-case "G" and "C" are counted. An empty sequence yields 0.0
    rather than raising ZeroDivisionError.
    """
    if not seq:
        return 0.0
    return 100 * (seq.count("G") + seq.count("C")) / len(seq)


def get_GC_content_dict(seq_dict):
    """Map every record name in ``seq_dict`` to its GC-content percentage."""
    return {k: count_GC(v) for k, v in seq_dict.items()}


def main():
    """Print the GC-content of every record in the configured FASTA file."""
    test_fasta = "your/path"
    GC_content_dict = get_GC_content_dict(fasta_to_dict(test_fasta))
    print(GC_content_dict)


if __name__ == '__main__':
    main()
solution - 2
golang实现
// Command main prints the GC-content (in percent) of every record in a
// FASTA file.
package main

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"strings"
)

// fp is the path of the FASTA file to analyse.
var fp = "/root/Golang/data/f.fa"

// maxLineBytes bounds the length of a single input line. It is well above
// bufio.Scanner's 64 KiB default so unwrapped sequences are still accepted.
const maxLineBytes = 16 * 1024 * 1024

// gcStat accumulates the G/C count and total length of one FASTA record.
type gcStat struct {
	name  string
	gc    int
	total int
}

// percent returns the GC-content of the record as a percentage. A record
// without any sequence data yields 0 instead of NaN.
func (s gcStat) percent() float64 {
	if s.total == 0 {
		return 0
	}
	return 100.0 * (float64(s.gc) / float64(s.total))
}

// appendFASTALine folds one line of a FASTA file into stats and returns the
// updated slice. A line starting with ">" opens a new record; any other
// non-blank line extends the most recent record, so sequences wrapped over
// several lines are counted as a whole. Blank lines and sequence data that
// precede the first header are ignored. Bases are matched case-insensitively.
func appendFASTALine(stats []gcStat, line string) []gcStat {
	line = strings.TrimSpace(line)
	switch {
	case line == "":
		return stats
	case strings.HasPrefix(line, ">"):
		// Drop only the leading marker; a ">" inside the label is kept.
		name := strings.TrimSpace(strings.TrimPrefix(line, ">"))
		return append(stats, gcStat{name: name})
	case len(stats) == 0:
		// Sequence data before any header belongs to no record.
		return stats
	}

	cur := &stats[len(stats)-1]
	for i := 0; i < len(line); i++ {
		switch line[i] {
		case 'G', 'C', 'g', 'c':
			cur.gc++
		}
	}
	cur.total += len(line)
	return stats
}

// readGCStats scans FASTA data from r and returns one gcStat per record, in
// file order.
func readGCStats(r io.Reader) ([]gcStat, error) {
	var stats []gcStat
	sc := bufio.NewScanner(r)
	sc.Buffer(make([]byte, 0, 64*1024), maxLineBytes)
	for sc.Scan() {
		stats = appendFASTALine(stats, sc.Text())
	}
	if err := sc.Err(); err != nil {
		return nil, fmt.Errorf("reading FASTA data: %w", err)
	}
	return stats, nil
}

// run prints a map from record name to GC-content for the FASTA file at path.
func run(path string) error {
	f, err := os.Open(path)
	if err != nil {
		return fmt.Errorf("opening FASTA file: %w", err)
	}
	defer f.Close()

	stats, err := readGCStats(f)
	if err != nil {
		return err
	}

	countMap := make(map[string]float64, len(stats))
	for _, s := range stats {
		countMap[s.name] = s.percent()
	}
	fmt.Println(countMap)
	return nil
}

func main() {
	if err := run(fp); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
