05. Computing GC Content - 《Rosalind and BioLeetCode - 生信版 Leetcode》

Problem
solution - 1
solution - 2

Problem

Problem
The GC-content of a DNA string is given by the percentage of symbols in the string that are 'C' or 'G'. For example, the GC-content of "AGCTATAG" is 37.5%. Note that the reverse complement of any DNA string has the same GC-content.
DNA strings must be labeled when they are consolidated into a database. A commonly used method of string labeling is called FASTA format. In this format, the string is introduced by a line that begins with '>', followed by some labeling information. Subsequent lines contain the string itself; the first line to begin with '>' indicates the label of the next string.
In Rosalind's implementation, a string in FASTA format will be labeled by the ID "Rosalind_xxxx", where "xxxx" denotes a four-digit code between 0000 and 9999.
Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each).
Return: The ID of the string having the highest GC-content, followed by the GC-content of that string. Rosalind allows for a default error of 0.001 in all decimal answers unless otherwise stated; please see the note on absolute error below.

input: 对于给定的fasta文件
output: 计算每条序列的GC含量

# input
>Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
>Rosalind_0808
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
TGGGAACCTGCGGGCAGTAGGTGGAAT
# output
Rosalind_0808
60.919540

solution - 1

def fasta_to_dict(fasta_path):
    """Parse a FASTA file into a ``{record name: sequence}`` dict.

    A record starts at a line beginning with ">" and runs until the next
    such line. All sequence lines of a record are upper-cased and joined,
    so sequences wrapped over several lines are returned whole.

    Args:
        fasta_path: Path of the FASTA file to read.

    Returns:
        dict mapping each header (without the leading ">" and surrounding
        whitespace) to its full sequence. If a header occurs more than
        once, the last record wins.
    """
    fasta_dict = {}
    current_name = None
    chunks = []
    with open(fasta_path) as fd:
        for line in fd:
            line = line.strip()
            if not line:
                # Blank lines carry no sequence data.
                continue
            if line.startswith(">"):
                # A new header closes the record collected so far.
                if current_name is not None:
                    fasta_dict[current_name] = "".join(chunks)
                current_name = line.lstrip(">").strip()
                chunks = []
            elif current_name is not None:
                # Sequence text before the first header has no record to
                # belong to and is ignored.
                chunks.append(line.upper())
    if current_name is not None:
        fasta_dict[current_name] = "".join(chunks)
    return fasta_dict
def count_GC(seq):
    """Return the GC content of ``seq`` as a percentage (0-100).

    Only upper-case "G" and "C" are counted; the denominator is the full
    length of ``seq``. An empty sequence yields 0.0 instead of raising
    ZeroDivisionError.
    """
    if not seq:
        return 0.0
    # 100.0 keeps the division a true division under Python 2 as well.
    return 100.0 * (seq.count("G") + seq.count("C")) / len(seq)
def get_GC_content_dict(seq_dict):
    """Map every key of ``seq_dict`` to ``count_GC`` of its value."""
    gc_by_name = {}
    for name, sequence in seq_dict.items():
        gc_by_name[name] = count_GC(sequence)
    return gc_by_name
def main():
    """Parse the FASTA file at the placeholder path and print its GC dict."""
    test_fasta = "your/path"
    sequences = fasta_to_dict(test_fasta)
    print(get_GC_content_dict(sequences))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

solution - 2

golang实现

package main
import (
    "fmt"
    "os"
    "bufio"
    "strings"
)
// fp is the path of the FASTA file that main reads.
var fp = "/root/Golang/data/f.fa"

// main prints a map from each FASTA record name in fp to the GC percentage
// of that record's sequence. It panics if the file cannot be opened or read.
func main() {
	f, err := os.Open(fp)
	if err != nil {
		panic("error file: " + err.Error())
	}
	defer f.Close()

	percents, err := gcPercentByName(bufio.NewScanner(f))
	if err != nil {
		panic("error file: " + err.Error())
	}
	fmt.Println(percents)
}

// gcPercentByName consumes FASTA text from input line by line and returns the
// GC percentage of every record keyed by its header (leading ">" and
// surrounding whitespace removed).
//
// A record spans all lines from its header up to the next header, so
// sequences wrapped over several lines are counted as one. Blank lines and
// sequence text before the first header are skipped. A record without any
// sequence text reports 0 rather than NaN. If a header occurs more than once,
// the last record wins.
func gcPercentByName(input *bufio.Scanner) (map[string]float64, error) {
	percents := make(map[string]float64)

	var (
		name      string
		gc, total int
		inRecord  bool
	)
	// flush stores the percentage of the record accumulated so far, if any.
	flush := func() {
		if !inRecord {
			return
		}
		percent := 0.0
		if total > 0 {
			percent = 100.0 * (float64(gc) / float64(total))
		}
		percents[name] = percent
	}

	for input.Scan() {
		line := strings.TrimSpace(input.Text())
		switch {
		case line == "":
			// Blank lines carry no sequence data.
		case strings.HasPrefix(line, ">"):
			// A new header closes the previous record.
			flush()
			name = strings.TrimSpace(strings.TrimPrefix(line, ">"))
			gc, total, inRecord = 0, 0, true
		case inRecord:
			gc += strings.Count(line, "C") + strings.Count(line, "G")
			total += len(line)
		}
	}
	flush()

	if err := input.Err(); err != nil {
		return nil, err
	}
	return percents, nil
}