拆分原理

  • 软件的逻辑是首先获取barcode列表。然后采用多线程分别在fastq文件中并行提取对应barcode的reads。

  • WGS的下机数据经常出现在fastq2里。所以程序会从fastq中自动查找是否存在对应barcode。

  • 程序可以自动检测barcode始于开始还是末尾,计算hanming距离,运行1bp的mismatch。

方法

  1. julia -t number threads split_barcode.jl -b barcode list -r read2 file -l read1 file -o outpath

代码举例

  1. using BioSequences
  2. using FASTX
  3. using CodecZlib
  4. using StringDistances
  5. using Base.Threads
  6. using Getopt
  7. function read_barcode(barcode_fs::String)
  8. labels = String[]
  9. barcodes = String[]
  10. if isfile(barcode_fs)
  11. for line in eachline(barcode_fs)
  12. label, sequence = split(line)
  13. push!(labels, label)
  14. push!(barcodes, sequence)
  15. end
  16. else
  17. println("no barcode file detected!")
  18. end
  19. labels, barcodes
  20. end
  21. function detective_barcode(seq::LongDNASeq, barcode::LongDNASeq)
  22. barcode_l = length(barcode)
  23. starter = seq[1:barcode_l]
  24. ender = seq[end-barcode_l:end]
  25. dist = Hamming()(starter, barcode)
  26. if dist == 0 || dist == 1 ## if barcode in title
  27. return 1, 1
  28. else
  29. dist = Hamming()(ender, barcode)# calular hamming distance allow 1bp mismatch
  30. if dist == 0 || dist == 1
  31. return 1, 2
  32. else
  33. return -1, 0
  34. end
  35. end
  36. end
  37. function process(fq_file_1::String, fq_file_2::String, barcode::LongDNASeq, label::String, outpath::String)
  38. reader_2 = FASTQ.Reader(GzipDecompressorStream(open(fq_file_2)))
  39. reader_1 = FASTQ.Reader(GzipDecompressorStream(open(fq_file_1)))
  40. wt_1 = FASTQ.Writer(open("$(outpath)/barcode_$(label)_1.fq", lock=true, "a")) ## lock for multiple threads safety
  41. wt_2 = FASTQ.Writer(open("$(outpath)/barcode_$(label)_2.fq", lock=true, "a"))
  42. for (record_1, record_2) in zip(reader_1, reader_2)
  43. fm, weizhi = detective_barcode(FASTQ.sequence(record_2), barcode)
  44. if fm == 1
  45. write(wt_1, record_1)
  46. if weizhi == 1
  47. write(wt_2, FASTQ.Record(FASTQ.identifier(record_2), FASTQ.sequence(record_2)[11:end], FASTQ.quality(record_2)[11:end]))
  48. elseif weizhi == 2
  49. write(wt_2, FASTQ.Record(FASTQ.identifier(record_2), FASTQ.sequence(record_2)[1:100], FASTQ.quality(record_2)[1:100]))
  50. end
  51. end
  52. end
  53. close(reader_1)
  54. close(reader_2)
  55. close(wt_1)
  56. close(wt_2)
  57. end
  58. function split_barcode(barcodes_fs::String, fq_file_2::String, fq_file_1::String, outpath::String)
  59. labels, barcodes = read_barcode(barcodes_fs)
  60. barcodes = LongDNASeq.(barcodes)
  61. duiying = Dict(zip(barcodes, labels))
  62. @threads for barcode in barcodes
  63. label = duiying[barcode]
  64. process(fq_file_1, fq_file_2, barcode, label, outpath)
  65. end
  66. end
  67. function Argparse()
  68. lst = Dict{String,String}()
  69. for (opt, arg) in getopt(ARGS, "hb:r:l:o:", ["help","barcodes=", "read2=","read1=","output="])
  70. opt = replace(opt, "-" => "")
  71. arg = replace(arg, "=" => "")
  72. if opt == "help" || opt == "h"
  73. println("this program is for spliting fastq file into different part according barcodes!")
  74. println("-h\t--help\tshow this message")
  75. println("-b\t--barcodes\tinput your barcodes file, should be negetive sequennce!")
  76. println("-r2\t--read2\tinput your fastq file read_2")
  77. println("-r1\t--read1\tinput your fastq file read_1")
  78. println("-o\t--output\tinput output file path")
  79. elseif opt == "barcodes" || opt == "b"
  80. lst["barcodes"] = arg
  81. elseif opt == "read2" || opt == "r"
  82. lst["read2"] = arg
  83. elseif opt == "read1" || opt == "l"
  84. lst["read1"] = arg
  85. elseif opt == "output" || opt == "o"
  86. lst["output"] = arg
  87. else
  88. println("please check your parameter!")
  89. end
  90. end
  91. lst
  92. end
  93. args = Argparse()
  94. if length(args) > 1
  95. split_barcode(args["barcodes"], args["read2"], args["read1"], args["output"])
  96. end