拆分原理
软件的逻辑是首先获取barcode列表。然后采用多线程分别在fastq文件中并行提取对应barcode的reads。
WGS的下机数据经常出现在fastq2里。所以程序会从fastq中自动查找是否存在对应barcode。
程序可以自动检测barcode始于开始还是末尾,计算hanming距离,运行1bp的mismatch。
方法
julia -t number threads split_barcode.jl -b barcode list -r read2 file -l read1 file -o outpath
代码举例
using BioSequencesusing FASTXusing CodecZlibusing StringDistancesusing Base.Threadsusing Getoptfunction read_barcode(barcode_fs::String)labels = String[]barcodes = String[]if isfile(barcode_fs)for line in eachline(barcode_fs)label, sequence = split(line)push!(labels, label)push!(barcodes, sequence)endelseprintln("no barcode file detected!")endlabels, barcodesendfunction detective_barcode(seq::LongDNASeq, barcode::LongDNASeq)barcode_l = length(barcode)starter = seq[1:barcode_l]ender = seq[end-barcode_l:end]dist = Hamming()(starter, barcode)if dist == 0 || dist == 1 ## if barcode in titlereturn 1, 1elsedist = Hamming()(ender, barcode)# calular hamming distance allow 1bp mismatchif dist == 0 || dist == 1return 1, 2elsereturn -1, 0endendendfunction process(fq_file_1::String, fq_file_2::String, barcode::LongDNASeq, label::String, outpath::String)reader_2 = FASTQ.Reader(GzipDecompressorStream(open(fq_file_2)))reader_1 = FASTQ.Reader(GzipDecompressorStream(open(fq_file_1)))wt_1 = FASTQ.Writer(open("$(outpath)/barcode_$(label)_1.fq", lock=true, "a")) ## lock for multiple threads safetywt_2 = FASTQ.Writer(open("$(outpath)/barcode_$(label)_2.fq", lock=true, "a"))for (record_1, record_2) in zip(reader_1, reader_2)fm, weizhi = detective_barcode(FASTQ.sequence(record_2), barcode)if fm == 1write(wt_1, record_1)if weizhi == 1write(wt_2, FASTQ.Record(FASTQ.identifier(record_2), FASTQ.sequence(record_2)[11:end], FASTQ.quality(record_2)[11:end]))elseif weizhi == 2write(wt_2, FASTQ.Record(FASTQ.identifier(record_2), FASTQ.sequence(record_2)[1:100], FASTQ.quality(record_2)[1:100]))endendendclose(reader_1)close(reader_2)close(wt_1)close(wt_2)endfunction split_barcode(barcodes_fs::String, fq_file_2::String, fq_file_1::String, outpath::String)labels, barcodes = read_barcode(barcodes_fs)barcodes = LongDNASeq.(barcodes)duiying = Dict(zip(barcodes, labels))@threads for barcode in barcodeslabel = duiying[barcode]process(fq_file_1, fq_file_2, barcode, label, outpath)endendfunction Argparse()lst = Dict{String,String}()for (opt, arg) in getopt(ARGS, "hb:r:l:o:", ["help","barcodes=", "read2=","read1=","output="])opt = replace(opt, "-" => "")arg = replace(arg, "=" => "")if opt == "help" || opt == "h"println("this program is for spliting fastq file into different part according barcodes!")println("-h\t--help\tshow this message")println("-b\t--barcodes\tinput your barcodes file, should be negetive sequennce!")println("-r2\t--read2\tinput your fastq file read_2")println("-r1\t--read1\tinput your fastq file read_1")println("-o\t--output\tinput output file path")elseif opt == "barcodes" || opt == "b"lst["barcodes"] = argelseif opt == "read2" || opt == "r"lst["read2"] = argelseif opt == "read1" || opt == "l"lst["read1"] = argelseif opt == "output" || opt == "o"lst["output"] = argelseprintln("please check your parameter!")endendlstendargs = Argparse()if length(args) > 1split_barcode(args["barcodes"], args["read2"], args["read1"], args["output"])end
