User Tools

Site Tools


myillu_06_sga.sh

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

myillu_06_sga.sh [2016/12/06 09:39] (current)
hyjeong created
Line 1: Line 1:
 +====== myIllu_06_sga.sh ======
  
 +  #!/bin/bash -x
 +  ​
 +  BASE=$(basename "​$1"​) ​ # delete any leading path
 +  BASE=${BASE%%.*}
 +  ​
 +  rm temp_1.fq temp_2.fq
 +  ​
 +  #
 +  # Parameters
 +  #
 +  ​
 +  # Program paths
 +  SGA_BIN=/​usr/​local/​bin/​sga
 +  BWA_BIN=/​usr/​local/​apps/​bwa-0.7.12/​bwa
 +  SAMTOOLS_BIN=/​usr/​local/​apps/​samtools-1.2/​samtools
 +  BAM2DE_BIN=/​usr/​local/​apps/​sga/​src/​bin/​sga-bam2de.pl
 +  ASTAT_BIN=/​usr/​local/​apps/​sga/​src/​bin/​sga-astat.py
 +  DISTANCE_EST=/​usr/​local/​apps/​abyss-1.9.0/​DistanceEst/​DistanceEst
 +  SEQTK=/​usr/​local/​apps/​seqtk/​seqtk
 +  ​
 +  # The number of threads to use
 +  CPU=20
 +  ​
 +  # Correction k-mer 
 +  CORRECTION_K=41
 +  ​
 +  # The minimum overlap to use when computing the graph.
 +  # The final assembly can be performed with this overlap or greater
 +  MIN_OVERLAP=85
 +  ​
 +  # The overlap value to use for the final assembly
 +  ASSEMBLE_OVERLAP=111
 +  ​
 +  # Branch trim length
 +  TRIM_LENGTH=400
 +  ​
 +  # The minimum length of contigs to include in a scaffold
 +  MIN_CONTIG_LENGTH=200
 +  ​
 +  # The minimum number of read pairs required to link two contigs
 +  MIN_PAIRS=10
 +  ​
 +  #
 +  # Dependency checks
 +  #
 +  ​
 +  # Check the required programs are installed and executable
 +  prog_list="​$SGA_BIN $BWA_BIN $SAMTOOLS_BIN $BAM2DE_BIN $DISTANCE_EST $ASTAT_BIN"​
 +  for prog in $prog_list; do
 +      hash $prog 2>/​dev/​null || { echo "Error $prog not found. Please place $prog on your PATH or update the *_BIN variables in this script";​ exit 1; }
 +  done 
 +  ​
 +  # interleaved file should be separated for bwa mapping
 +  echo "​Separating interleaved file $1 into two temporary paired files..."​
 +  $SEQTK seq -1 $1 > temp_1.fq
 +  $SEQTK seq -2 $1 > temp_2.fq
 +  IN1=temp_1.fq
 +  IN2=temp_2.fq
 +  ​
 +  ​
 +  # Check the files are found
 +  file_list="​$IN1 $IN2"
 +  for input in $file_list; do
 +      if [ ! -f $input ]; then
 +          echo "Error input file $input not found";​ exit 1;
 +      fi
 +  done
 +  ​
 +  #
 +  # Preprocessing
 +  #
 +  ​
 +  # Preprocess the data to remove ambiguous basecalls
 +  # You can specify -m, --min-length=INT option here, which  is most useful when used in
 +  #   ​conjunction with --quality trim, Default: 40
 +  $SGA_BIN preprocess -m 101 --pe-mode 2 -o ${BASE}.pp.fastq $1
 +  ​
 +  #
 +  # Error Correction
 +  #
 +  ​
 +  # Build the index that will be used for error correction
 +  # As the error corrector does not require the reverse BWT, suppress
 +  # construction of the reversed index
 +  $SGA_BIN index -a ropebwt -t $CPU --no-reverse ${BASE}.pp.fastq
 +  ​
 +  # Perform k-mer based error correction.
 +  # The k-mer cutoff parameter is learned automatically.
 +  $SGA_BIN correct -k $CORRECTION_K --learn -t $CPU -o ${BASE}.ec.fastq ${BASE}.pp.fastq
 +  ​
 +  exit
 +  #
 +  # Primary (contig) assembly
 +  #
 +  ​
 +  # Index the corrected data.
 +  $SGA_BIN index -a ropebwt -t $CPU ${BASE}.ec.fastq
 +  ​
 +  # Remove exact-match duplicates and ${BASE}.with low-frequency k-mers
 +  $SGA_BIN filter -x 2 -t $CPU ${BASE}.ec.fastq
 +  ​
 +  # Compute the structure of the string graph
 +  $SGA_BIN overlap -m $MIN_OVERLAP -t $CPU ${BASE}.ec.filter.pass.fa
 +  ​
 +  # Perform the contig assembly
 +  $SGA_BIN assemble -m $ASSEMBLE_OVERLAP --min-branch-length $TRIM_LENGTH -o primary ${BASE}.ec.filter.pass.asqg.gz
 +  ​
 +  #
 +  # Scaffolding
 +  #
 +  ​
 +  PRIMARY_CONTIGS=primary-contigs.fa
 +  PRIMARY_GRAPH=primary-graph.asqg.gz
 +  ​
 +  # Align the reads to the contigs
 +  $BWA_BIN index $PRIMARY_CONTIGS
 +  $BWA_BIN aln -t $CPU $PRIMARY_CONTIGS $IN1 > $IN1.sai
 +  $BWA_BIN aln -t $CPU $PRIMARY_CONTIGS $IN2 > $IN2.sai
 +  $BWA_BIN sampe $PRIMARY_CONTIGS $IN1.sai $IN2.sai $IN1 $IN2 | $SAMTOOLS_BIN view -Sb - > libPE.bam
 +  ​
 +  # Convert the BAM file into a set of contig-contig distance estimates
 +  $BAM2DE_BIN -n $MIN_PAIRS -m $MIN_CONTIG_LENGTH --prefix libPE libPE.bam
 +  ​
 +  # Compute copy number estimates of the contigs
 +  $ASTAT_BIN -m $MIN_CONTIG_LENGTH libPE.bam > libPE.astat
 +  ​
 +  # Build the scaffolds
 +  $SGA_BIN scaffold -m $MIN_CONTIG_LENGTH -a libPE.astat -o scaffolds.scaf --pe libPE.de $PRIMARY_CONTIGS
 +  ​
 +  # Convert the scaffolds to FASTA format
 +  $SGA_BIN scaffold2fasta --use-overlap --write-unplaced -m $MIN_CONTIG_LENGTH -a $PRIMARY_GRAPH -o sga-scaffolds.fa scaffolds.scaf
 +  ​
 +  # Remove temporary paired files
 +  #rm temp_1.fq temp_2.fq temp_1.fq.sai temp_2.fq.sai
myillu_06_sga.sh.txt ยท Last modified: 2016/12/06 09:39 by hyjeong