User Tools

Site Tools


bioinfo:gbkinfo_script

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

bioinfo:gbkinfo_script [2019/06/05 09:50] (current)
hyjeong created
Line 1: Line 1:
 +====== gbkInfo script ======
 +GenBank flat file을 열어서 locus tag, product 등 기본 정보를 tab-delimited text file로 출력하고,​ CDS 정보는 표준출력으로 내보내는 스크립트이다. tr 옵션을 주면 아미노산 서열로 번역이 되어 출력된다. 용도에 따라 여러 가지 형태로 변형하여 사용했었지만,​ 아래에 소개한 **gbkInfo_working.pl**은 가장 포괄적인 기능을 갖도록 작성하였다.
  
 +  #​!/​usr/​bin/​perl
 +  # 
 +  # Usage - gbkInfo_working.pl GenBank_file [tr]
 +  #
 +  ​
 +  use Bio::SeqIO;
 +  ​
 +  $seqIn = Bio::​SeqIO->​new(-file => $ARGV[0], -format=>'​genbank'​);​
 +  $infoFile = $ARGV[0] . '​.txt';​
 +  $seqOut = Bio::​SeqIO->​new(-fh=>​ \*STDOUT, -format=>'​fasta'​);​
 +  ​
 +  # Can handle multiple sequence objects in a single GenBank file
 +  while (my $seqObj = $seqIn->​next_seq()) {
 +      @features = $seqObj->​get_SeqFeatures();​
 +      foreach my $feat ( @features ) {
 +          # types of primary_tag:​ source, gene, CDS, rRNA, tRNA
 +          if ($feat->​primary_tag eq '​source'​) {
 +              print STDERR "​[STDERR] Organism: ", $feat->​get_tag_values(organism),​ "​\n"​ if $feat->​has_tag(organism);​
 +          }
 +  ​
 +  # For each gene feature, check if it is pseudo or not.
 +  # Extract function info if possible.
 +  # Values will be stored in %annot hash.
 +  ​
 +          if ($feat->​primary_tag eq '​gene'​) {
 +              $num++;
 +              $locus = eval {($feat->​get_tag_values(locus_tag))[0]};​
 +              $annot{$locus}->​{'​function'​} = eval {($feat->​get_tag_values(function))[0]} if $feat->​has_tag(function);​
 +              $annot{$locus}->​{'​old_locus_tag'​} = eval {($feat->​get_tag_values(old_locus_tag))[0]} if $feat->​has_tag(old_locus_tag);​
 +              if ($feat->​has_tag(pseudo)) {
 +                  $annot{$locus}->​{'​isPseudo'​} = '​pseudo';​
 +                  push @pseudo, $locus;
 +              }
 +              if ($feat->​strand == '​1'​) {
 +                  $annot{$locus}->​{'​strand'​} = '​+';​
 +                  $annot{$locus}->​{'​stop'​} = $feat->​end;​
 +              } else {
 +                  $annot{$locus}->​{'​strand'​} = '​-';​
 +                  $annot{$locus}->​{'​stop'​} = $feat->​start;​
 +              }
 +  # start < end
 +              $annot{$locus}->​{'​start'​} = $feat->​start;​
 +              $annot{$locus}->​{'​end'​} = $feat->​end;​
 +          }
 +  ​
 +  # For each CDS feature, product/​gene/​EC_number will be extracted.
 +  # There can be multiple EC numbers in one CDS feature.
 +  #
 +  ​
 +          if ($feat->​primary_tag eq '​CDS'​) {
 +              $num2++ unless $feat->​has_tag(pseudo);​
 +              my $locus = eval {($feat->​get_tag_values(locus_tag))[0]};​
 +  ​
 +  # If there are multiple "​pseudo"​ CDS features (fragmentary) in one given locus,
 +  # process the first one only. Remaining CDS feature(s) has the same information.
 +  #
 +              next if exists $seen{$locus};​
 +              $seen{$locus} = '';​
 +  ​
 +              $annot{$locus}->​{'​product'​} = eval {($feat->​get_tag_values(product))[0]} if $feat->​has_tag(product);​
 +              $annot{$locus}->​{'​gene'​} = eval {($feat->​get_tag_values(gene))[0]} if $feat->​has_tag(gene);​
 +              $annot{$locus}->​{'​protein_id'​} = eval {($feat->​get_tag_values(protein_id))[0]} if $feat->​has_tag(protein_id);​
 +  ​
 +  # If a CDS has several EC_numbers, join them into one (ex: 3.2.2.23; 4.2.99.18)
 +  #
 +              if ($feat->​has_tag(EC_number)) {
 +                  my @tag = $feat->​get_tag_values(EC_number);​
 +                  $annot{$locus}->​{EC_number} = join "; ", @tag;
 +              }
 +  ​
 +  # Extracting gene sequences (not pseudo). ​
 +  # If you want to process specified genes only, then use $locus variable.
 +  #
 +              if ($feat->​has_tag(translation)) {
 +                  my $NTsequence = $feat->​seq->​seq();​
 +                  my $AAsequence = eval {($feat->​get_tag_values(translation))[0]};​
 +                  my $sequence = $NTsequence;​
 +                  $sequence = $AAsequence if $ARGV[1] eq '​tr';​
 +                  my $product = $annot{$locus}->​{'​product'​};​
 +  # You can add any other information to '​desc'​.
 +                  my $seqObj = Bio::​Seq->​new(-display_id => $locus,
 +                                             ​-desc ​      => $product,
 +                                             ​-seq ​       => $sequence
 +                                             );
 +                  $seqOut->​write_seq($seqObj);​
 +              }
 +          }
 +      }
 +  }
 +  ​
 +  print STDERR "​[STDERR] $ARGV[0] is $num gene features\n";​
 +  print STDERR "​[STDERR] $ARGV[0] is $num2 active CDS features (not marked as pseudo)\n";​
 +  ​
 +  # Now print the entire information!
 +  #
 +  ​
 +  ​
 +  open INFO, ">​$infoFile"​ or die "​Can'​t open $infoFile for writing!\n";​
 +  print STDERR "​[STDERR] Feature information is being written to $infoFile (pre-existing file will be overwritten!)\n";​
 +  ​
 +  print INFO join "​\t",​ '#​source',​ '​feature',​ 'locus tag', 'old locus tag', '​isPseudo?',​ '​start',​ '​end',​ '​straind',​
 +             '​stop',​ '​gene',​ '​product',​ '​protein id', 'EC number',​ '​function'​ . "​\n";​
 +  for my $locus (keys %annot) {
 +      print INFO join "​\t",​ $ARGV[0], '​protein-coding gene', $locus, $annot{$locus}->​{'​old_locus_tag'​}, ​
 +                 ​$annot{$locus}->​{'​isPseudo'​},​
 +                 ​$annot{$locus}->​{'​start'​},​ $annot{$locus}->​{'​end'​},​ $annot{$locus}->​{'​strand'​},​
 +                 ​$annot{$locus}->​{'​stop'​},​ $annot{$locus}->​{'​gene'​},​ $annot{$locus}->​{'​product'​},​
 +                 ​$annot{$locus}->​{'​protein_id'​}, ​
 +                 ​$annot{$locus}->​{'​EC_number'​},​ $annot{$locus}->​{'​function'​}
 +                 . "​\n";​
 +  }
bioinfo/gbkinfo_script.txt · Last modified: 2019/06/05 09:50 by hyjeong