User Tools

Site Tools


Sidebar

This is the sidebar. Without it, the main text is too wide!

myillu_06_sga.sh

myIllu_06_sga.sh

#!/bin/bash -x

BASE=$(basename "$1")  # delete any leading path
BASE=${BASE%%.*}

rm temp_1.fq temp_2.fq

#
# Parameters
#

# Program paths
SGA_BIN=/usr/local/bin/sga
BWA_BIN=/usr/local/apps/bwa-0.7.12/bwa
SAMTOOLS_BIN=/usr/local/apps/samtools-1.2/samtools
BAM2DE_BIN=/usr/local/apps/sga/src/bin/sga-bam2de.pl
ASTAT_BIN=/usr/local/apps/sga/src/bin/sga-astat.py
DISTANCE_EST=/usr/local/apps/abyss-1.9.0/DistanceEst/DistanceEst
SEQTK=/usr/local/apps/seqtk/seqtk

# The number of threads to use
CPU=20

# Correction k-mer 
CORRECTION_K=41

# The minimum overlap to use when computing the graph.
# The final assembly can be performed with this overlap or greater
MIN_OVERLAP=85

# The overlap value to use for the final assembly
ASSEMBLE_OVERLAP=111

# Branch trim length
TRIM_LENGTH=400

# The minimum length of contigs to include in a scaffold
MIN_CONTIG_LENGTH=200

# The minimum number of read pairs required to link two contigs
MIN_PAIRS=10

#
# Dependency checks
#

# Check the required programs are installed and executable
prog_list="$SGA_BIN $BWA_BIN $SAMTOOLS_BIN $BAM2DE_BIN $DISTANCE_EST $ASTAT_BIN"
for prog in $prog_list; do
    hash $prog 2>/dev/null || { echo "Error $prog not found. Please place $prog on your PATH or update the *_BIN variables in this script"; exit 1; }
done 

# interleaved file should be separated for bwa mapping
echo "Separating interleaved file $1 into two temporary paired files..."
$SEQTK seq -1 $1 > temp_1.fq
$SEQTK seq -2 $1 > temp_2.fq
IN1=temp_1.fq
IN2=temp_2.fq


# Check the files are found
file_list="$IN1 $IN2"
for input in $file_list; do
    if [ ! -f $input ]; then
        echo "Error input file $input not found"; exit 1;
    fi
done

#
# Preprocessing
#

# Preprocess the data to remove ambiguous basecalls
# You can specify -m, --min-length=INT option here, which  is most useful when used in
#   conjunction with --quality trim, Default: 40
$SGA_BIN preprocess -m 101 --pe-mode 2 -o ${BASE}.pp.fastq $1

#
# Error Correction
#

# Build the index that will be used for error correction
# As the error corrector does not require the reverse BWT, suppress
# construction of the reversed index
$SGA_BIN index -a ropebwt -t $CPU --no-reverse ${BASE}.pp.fastq

# Perform k-mer based error correction.
# The k-mer cutoff parameter is learned automatically.
$SGA_BIN correct -k $CORRECTION_K --learn -t $CPU -o ${BASE}.ec.fastq ${BASE}.pp.fastq

exit
#
# Primary (contig) assembly
#

# Index the corrected data.
$SGA_BIN index -a ropebwt -t $CPU ${BASE}.ec.fastq

# Remove exact-match duplicates and ${BASE}.with low-frequency k-mers
$SGA_BIN filter -x 2 -t $CPU ${BASE}.ec.fastq

# Compute the structure of the string graph
$SGA_BIN overlap -m $MIN_OVERLAP -t $CPU ${BASE}.ec.filter.pass.fa

# Perform the contig assembly
$SGA_BIN assemble -m $ASSEMBLE_OVERLAP --min-branch-length $TRIM_LENGTH -o primary ${BASE}.ec.filter.pass.asqg.gz

#
# Scaffolding
#

PRIMARY_CONTIGS=primary-contigs.fa
PRIMARY_GRAPH=primary-graph.asqg.gz

# Align the reads to the contigs
$BWA_BIN index $PRIMARY_CONTIGS
$BWA_BIN aln -t $CPU $PRIMARY_CONTIGS $IN1 > $IN1.sai
$BWA_BIN aln -t $CPU $PRIMARY_CONTIGS $IN2 > $IN2.sai
$BWA_BIN sampe $PRIMARY_CONTIGS $IN1.sai $IN2.sai $IN1 $IN2 | $SAMTOOLS_BIN view -Sb - > libPE.bam

# Convert the BAM file into a set of contig-contig distance estimates
$BAM2DE_BIN -n $MIN_PAIRS -m $MIN_CONTIG_LENGTH --prefix libPE libPE.bam

# Compute copy number estimates of the contigs
$ASTAT_BIN -m $MIN_CONTIG_LENGTH libPE.bam > libPE.astat

# Build the scaffolds
$SGA_BIN scaffold -m $MIN_CONTIG_LENGTH -a libPE.astat -o scaffolds.scaf --pe libPE.de $PRIMARY_CONTIGS

# Convert the scaffolds to FASTA format
$SGA_BIN scaffold2fasta --use-overlap --write-unplaced -m $MIN_CONTIG_LENGTH -a $PRIMARY_GRAPH -o sga-scaffolds.fa scaffolds.scaf

# Remove temporary paired files
#rm temp_1.fq temp_2.fq temp_1.fq.sai temp_2.fq.sai
myillu_06_sga.sh.txt · Last modified: 2016/12/06 09:39 by hyjeong