User Tools

Site Tools


2.rnaseq

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

2.rnaseq [2017/12/15 15:40] (current)
hyjeong created
Line 1: Line 1:
 +  ##### installing packages example (will not do in workshop) 
 +  tr() 
 +  setRepositories() 
 +  install.packages("​psych"​) 
 +  library(psych) 
 +  tr() 
 +  m <- matrix(1:​16,​ncol=4) 
 +  tr(m) 
 +  #####################################​ 
 +   
 +  #################​ 
 +  #  Basic plots  # 
 +  #################​ 
 +  x11(width = 6, height = 6)   #code to open up a window in linux 
 +  counts <- matrix( rnorm(200,​mean=0,​sd=1),​20,​10 )   #make a normal count 
 +  boxplot( counts[,​1],​counts[,​3],​counts[,​5],​counts[,​7],​counts[,​9],​counts[,​10] , col=c("​black","​blue","​green","​red","​purple","​yellow"​) ) 
 +  boxplot( counts[,​1],​counts[,​3],​counts[,​5],​counts[,​7],​counts[,​9],​counts[,​10] , col=c("​red","​red","​red","​blue","​blue","​blue"​) ) 
 +  test_data <- c(1,​2,​2,​3,​7,​1,​2,​3,​10,​11,​25,​34,​3,​6,​7,​9,​9,​10,​10,​10) ​ #make test data 
 +  hist(test_data) 
 +  test_data <- rnorm(500, mean = 0, sd = 1) 
 +  hist(test_data) 
 +  hist(test_data,​ main = "​Histogram of Test Dataset",​ xlab = "​Trait",​ ylab = "​Individual Counts"​ ) 
 +  dev.off() #or close the window 
 +   
 +  #실습 1교시 마무리. 
 +   
 +  ## Set the working directory (not needed for workshop) 
 +  #​setwd("​C:​\\PATH\\"​) ## in pc 
 +  #​setwd("/​home/​2.RNASeq_practice/​5.HTseq_simulated"​) ​  ##in linux 
 +   
 +  ## Call necessary libraries (these are preinstalled [ex:​install.packages("​limma"​)]) 
 +  library(statmod) 
 +  library(locfit) 
 +  library(edgeR) 
 +  library(limma) 
 +  library(DESeq2) 
 +   
 +  ## note the bad samples (6 and 10) 
 +  ## save used samples'​ index to Sam_Index 
 +  Samp_Index <- c(1:​5,​7:​9,​11:​23) 
 +   
 +  ## or you can do 
 +  Samp_Index <- c(1:23) 
 +  Samp_Index <- Samp_Index[-c(6,​10)] 
 +   
 +  ## take out bad samples 
 +  meta_data <- read.table("​meta_data_mRNA.txt",​ sep="​\t",​ head = T) 
 +  File_list <- as.character(meta_data[,​1]) 
 +  File_list <- File_list[Samp_Index] 
 +  head(File_list) #check first few component of File_list 
 +   
 +  ### Data Read (first file) 
 +  temp <- read.table(File_list[1],​ sep="​\t",​ head=F) 
 +  temp <- temp[-((nrow(temp)-4):​nrow(temp)),​] ​ #remove last 5 lines 
 +   
 +  Gene_Symbol <- as.character(temp[,​1]) 
 +  Expression <- data.frame(temp[,​2]) 
 +   
 +  ## Merge matrix (read all other good sample files and append) 
 +  for(i in 2:​length(File_list)){ ​  #from file_list, 2nd to last files are called since first file is already called 
 +     temp <- read.table(File_list[i],​ sep="​\t",​ head=F) 
 +     temp <- temp[-((nrow(temp)-4):​nrow(temp)),​] # remove last 5 lines same as the first file 
 +   
 +     ​Expression <- data.frame(Expression,​ temp[,2]) 
 +  } 
 +  colnames(Expression) <- File_list #​samples(columns) are marked with their original file name 
 +  head(Expression) #check first few lines of Expression 
 +  dim(Expression) #check dimension of Expression 
 +  head(Gene_Symbol) 
 +   
 +  #### remove all zero gene for mRNA data  #from original 14218 genes 
 +  Removed_ID <- which(rowSums(Expression) == 0)   #1067 genes removed 
 +  Expression <- Expression[-Removed_ID,​] 
 +  Gene_Symbol <- Gene_Symbol[-Removed_ID] 
 +  dim(Expression) 
 +  length(Gene_Symbol) ​  #​13151 
 +   
 +  #### gene annotation ##### 
 +  ####after annotation file '#​bin'​ delete 
 +  Gene_annotation <- read.table("​workshop_practice.annotation",​head=T,​ stringsAsFactors=FALSE) 
 +  Gene_annotation <- Gene_annotation[,​c(2,​13)] ​ ## get information we need: col_2 and col_3 
 +  #match Gene_name with Gene_Symbol 
 +  Gene_name <- c() 
 +  for ( i in 1:​length(Gene_Symbol) ){ 
 +  Gene_name[i]<​- Gene_annotation[match(Gene_Symbol[i],​Gene_annotation[,​1]),​2] 
 +  } 
 +   
 +  ## Start RNAseq analysis 
 +  #########################################​ 
 +  ### DEG analysis Set for 1-way Analysis 
 +  #########################################​ 
 +  Treat <- factor( meta_data[,​3],​ levels = c("​male","​female"​) ) 
 +  Treat <- Treat[Samp_Index] ​ #dimension correction 
 +  targets <- data.frame(Treat) 
 +   
 +  ####################################################​ 
 +  ### Get normalized TMM values 
 +  ####################################################​ 
 +   
 +  design <- model.matrix(~Treat,​ data = targets) 
 +  colnames(design) 
 +   
 +  Y <- DGEList(counts=Expression,​ gene=Gene_Symbol) 
 +  Y <- estimateDisp(Y,​design) 
 +  #Y <- estimateGLMCommonDisp(Y,​design) 
 +  #Y <- estimateGLMTrendedDisp(Y,​design) 
 +  #Y <- estimateGLMTagwiseDisp(Y,​design) 
 +  fit <- glmFit(Y, design) 
 +  colnames(fit) 
 +   
 +  ## Get normalized values 
 +  TMM <- calcNormFactors(Y,​ method="​TMM"​)  
 +  TMM <- cpm(TMM, normalized.lib.sizes=TRUE,​ log=T) 
 +   
 +  ### 1-way ANODEV based apporach for testing Stage effects 
 +  ## female vs. male 
 +  Result_anodev <- glmLRT(fit, coef=2) 
 +  Result_anodev_table <- topTags(Result_anodev,​ n=dim(Expression)[1],​ sort.by="​none"​)$table 
 +  sum(Result_anodev_table$FDR < 0.05) #2 
 +  head(Result_anodev_table)  
 +   
 +  #to visualize the two significant genes... 
 +  head(Result_anodev_table[order(Result_anodev_table$FDR),​]) #order the table by FDR value (ascending) 
 +  Gene_name[1011] 
 +  Gene_name[5473] 
 +   
 +  #quick view of histogram of p-values 
 +  hist(Result_anodev_table$PValue) 
 +  hist(Result_anodev_table$PValue,​ main = "​P-value Distribution Male vs. Female",​ xlab= "​P-values",​ ylab="​Count"​) 
 +   
 +  Final_anodev_table <- cbind (Gene_name,​Result_anodev_table) #add the matching gene names to the anodev table 
 +   
 +  ### save the final table results to Result_anodev.txt 
 +  write.table( Final_anodev_table,"​Result_anodev.txt",​ row.names=F,​sep="​\t"​ ) 
 +   
 +  ####################################################​ 
 +  ###    What to do when you have NO REPLICATES? ​  ### 
 +  ####################################################​ 
 +  ### 1. Be satisfied with a descriptive analysis such as fold change. Do not attempt a significance analysis. 
 +  ### 2. Simply pick a reasonable dispersion value, based on the experience or study design. 
 +  bcv <- 0.2  #BCV is square-root-dispersion 
 +  counts <- matrix( rnbinom(40,​size=1/​bcv^2,​mu=10),​20,​2 ) 
 +  y <- DGEList( counts, group=1:2 ) 
 +  y <- exactTest( y, dispersion = bcv^2 ) 
 +  y$table 
 +   
 +  #######################​ EOF #######################​
2.rnaseq.txt · Last modified: 2017/12/15 15:40 by hyjeong