Problem

  1. Problem
  2. The GC-content of a DNA string is given by the percentage of symbols in the string that are 'C' or 'G'. For example, the GC-content of "AGCTATAG" is 37.5%. Note that the reverse complement of any DNA string has the same GC-content.
  3. DNA strings must be labeled when they are consolidated into a database. A commonly used method of string labeling is called FASTA format. In this format, the string is introduced by a line that begins with '>', followed by some labeling information. Subsequent lines contain the string itself; the first line to begin with '>' indicates the label of the next string.
  4. In Rosalind's implementation, a string in FASTA format will be labeled by the ID "Rosalind_xxxx", where "xxxx" denotes a four-digit code between 0000 and 9999.
  5. Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each).
  6. Return: The ID of the string having the highest GC-content, followed by the GC-content of that string. Rosalind allows for a default error of 0.001 in all decimal answers unless otherwise stated; please see the note on absolute error below.

input: 对于给定的fasta文件
output: 计算每条序列的GC含量

  1. # input
  2. >Rosalind_6404
  3. CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
  4. TCCCACTAATAATTCTGAGG
  5. >Rosalind_5959
  6. CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
  7. ATATCCATTTGTCAGCAGACACGC
  8. >Rosalind_0808
  9. CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
  10. TGGGAACCTGCGGGCAGTAGGTGGAAT
  11. # output
  12. Rosalind_0808
  13. 60.919540

solution - 1

  def fasta_to_dict(fasta_path):
      """Parse a FASTA file into a ``{sequence_id: sequence}`` dictionary.

      A record starts at a line beginning with ``>``; the text after the
      marker (whitespace-trimmed) is used as the key.  All following lines up
      to the next header are upper-cased and concatenated, so sequences that
      are wrapped over several lines are stored as one string.

      Args:
          fasta_path: Path of the FASTA file to read.

      Returns:
          dict mapping each header to its full, upper-cased sequence.  If a
          header occurs more than once, the last record wins.
      """
      chunks_by_name = {}
      current_name = None
      with open(fasta_path) as fd:
          for raw_line in fd:
              line = raw_line.strip()
              if not line:
                  # Blank lines carry no sequence data.
                  continue
              if line.startswith(">"):
                  current_name = line.lstrip(">").strip()
                  chunks_by_name[current_name] = []
              elif current_name is not None:
                  # Continuation line: belongs to the most recent header.
                  # Sequence text that appears before any header is ignored.
                  chunks_by_name[current_name].append(line.upper())
      return {name: "".join(chunks) for name, chunks in chunks_by_name.items()}
  def count_GC(seq):
      """Return the GC-content of ``seq`` as a percentage (0-100).

      The count is case-insensitive, so ``"gc"`` and ``"GC"`` give the same
      result.

      Args:
          seq: DNA string.

      Returns:
          float percentage of characters that are ``G`` or ``C``; ``0.0`` for
          an empty sequence (avoids a ``ZeroDivisionError``).
      """
      if not seq:
          return 0.0
      bases = seq.upper()
      return 100 * (bases.count("G") + bases.count("C")) / len(seq)
  def get_GC_content_dict(seq_dict):
      """Map every key of ``seq_dict`` to ``count_GC`` of its sequence.

      Args:
          seq_dict: dict of ``{sequence_id: sequence}``.

      Returns:
          dict with the same keys, whose values are the results of
          ``count_GC`` for the corresponding sequences.
      """
      gc_by_id = {}
      for seq_id, bases in seq_dict.items():
          gc_by_id[seq_id] = count_GC(bases)
      return gc_by_id
  def main():
      """Print the per-sequence GC-content dictionary for the configured FASTA file."""
      # Placeholder path: replace with the location of the FASTA file to analyse.
      fasta_path = "your/path"
      sequences = fasta_to_dict(fasta_path)
      print(get_GC_content_dict(sequences))


  # Run only when executed as a script, not when imported.
  if __name__ == '__main__':
      main()

solution - 2

golang实现

  1. package main
  2. import (
  3. "fmt"
  4. "os"
  5. "bufio"
  6. "strings"
  7. )
  // fp is the path of the FASTA file that main reads.
  var fp string = "/root/Golang/data/f.fa"

  // gcPercent returns the percentage (0-100) of bytes in seq that are 'G' or 'C'.
  // An empty sequence yields 0 instead of the NaN a 0/0 division would produce.
  func gcPercent(seq string) float64 {
  	if len(seq) == 0 {
  		return 0
  	}
  	gc := strings.Count(seq, "G") + strings.Count(seq, "C")
  	return 100.0 * (float64(gc) / float64(len(seq)))
  }

  // main reads the FASTA file at fp, computes the GC-content of every record
  // and prints the resulting name -> percentage map.
  func main() {
  	f, err := os.Open(fp)
  	if err != nil {
  		panic("error file")
  	}
  	defer f.Close()

  	// faName[i] is the header of the record whose full sequence is faSeq[i].
  	var faName []string
  	var faSeq []string

  	input := bufio.NewScanner(f)
  	// Read line by line.
  	for input.Scan() {
  		s := strings.TrimSpace(input.Text())
  		switch {
  		case s == "":
  			// Blank lines carry no sequence data.
  		case strings.HasPrefix(s, ">"):
  			// Only the leading marker is removed; a new, empty record starts here.
  			faName = append(faName, strings.TrimSpace(strings.TrimPrefix(s, ">")))
  			faSeq = append(faSeq, "")
  		case len(faSeq) > 0:
  			// Continuation line: append to the current record so that sequences
  			// wrapped over several lines stay paired with their header.
  			faSeq[len(faSeq)-1] += strings.ToUpper(s)
  		}
  	}
  	if err := input.Err(); err != nil {
  		panic(err)
  	}

  	countMap := make(map[string]float64, len(faName))
  	for inx, seq := range faSeq {
  		countMap[faName[inx]] = gcPercent(seq)
  	}
  	fmt.Println(countMap)
  }