User Tools

Site Tools


2.rnaseq

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

2.rnaseq [2017/12/15 15:40] (current)
hyjeong created
Line 1: Line 1:
 +  ##### installing packages example (will not do in workshop) 
 +  tr() 
 +  setRepositories() 
 +  install.packages("psych"
 +  library(psych) 
 +  tr() 
 +  m <- matrix(1:16,ncol=4) 
 +  tr(m) 
 +  ##################################### 
 +   
 +  ################# 
 +  #  Basic plots  # 
 +  ################# 
 +  x11(width = 6, height = 6)   #code to open up a window in linux 
 +  counts <- matrix( rnorm(200,mean=0,sd=1),20,10 )   #make a normal count 
 +  boxplot( counts[,1],counts[,3],counts[,5],counts[,7],counts[,9],counts[,10] , col=c("black","blue","green","red","purple","yellow") ) 
 +  boxplot( counts[,1],counts[,3],counts[,5],counts[,7],counts[,9],counts[,10] , col=c("red","red","red","blue","blue","blue") ) 
 +  test_data <- c(1,2,2,3,7,1,2,3,10,11,25,34,3,6,7,9,9,10,10,10)  #make test data 
 +  hist(test_data) 
 +  test_data <- rnorm(500, mean = 0, sd = 1) 
 +  hist(test_data) 
 +  hist(test_data, main = "Histogram of Test Dataset", xlab = "Trait", ylab = "Individual Counts"
 +  dev.off() #or close the window 
 +   
 +  #실습 1교시 마무리. 
 +   
 +  ## Set the working directory (not needed for workshop) 
 +  #setwd("C:\\PATH\\") ## in pc 
 +  #setwd("/home/2.RNASeq_practice/5.HTseq_simulated"  ##in linux 
 +   
 +  ## Call necessary libraries (these are preinstalled [ex:install.packages("limma")]) 
 +  library(statmod) 
 +  library(locfit) 
 +  library(edgeR) 
 +  library(limma) 
 +  library(DESeq2) 
 +   
 +  ## note the bad samples (6 and 10) 
 +  ## save used samples' index to Sam_Index 
 +  Samp_Index <- c(1:5,7:9,11:23) 
 +   
 +  ## or you can do 
 +  Samp_Index <- c(1:23) 
 +  Samp_Index <- Samp_Index[-c(6,10)] 
 +   
 +  ## take out bad samples 
 +  meta_data <- read.table("meta_data_mRNA.txt", sep="\t", head = T) 
 +  File_list <- as.character(meta_data[,1]) 
 +  File_list <- File_list[Samp_Index] 
 +  head(File_list) #check first few component of File_list 
 +   
 +  ### Data Read (first file) 
 +  temp <- read.table(File_list[1], sep="\t", head=F) 
 +  temp <- temp[-((nrow(temp)-4):nrow(temp)), #remove last 5 lines 
 +   
 +  Gene_Symbol <- as.character(temp[,1]) 
 +  Expression <- data.frame(temp[,2]) 
 +   
 +  ## Merge matrix (read all other good sample files and append) 
 +  for(i in 2:length(File_list)){   #from file_list, 2nd to last files are called since first file is already called 
 +     temp <- read.table(File_list[i], sep="\t", head=F) 
 +     temp <- temp[-((nrow(temp)-4):nrow(temp)),] # remove last 5 lines same as the first file 
 +   
 +     Expression <- data.frame(Expression, temp[,2]) 
 +  } 
 +  colnames(Expression) <- File_list #samples(columns) are marked with their original file name 
 +  head(Expression) #check first few lines of Expression 
 +  dim(Expression) #check dimension of Expression 
 +  head(Gene_Symbol) 
 +   
 +  #### remove all zero gene for mRNA data  #from original 14218 genes 
 +  Removed_ID <- which(rowSums(Expression) == 0)   #1067 genes removed 
 +  Expression <- Expression[-Removed_ID,
 +  Gene_Symbol <- Gene_Symbol[-Removed_ID] 
 +  dim(Expression) 
 +  length(Gene_Symbol)   #13151 
 +   
 +  #### gene annotation ##### 
 +  ####after annotation file '#bin' delete 
 +  Gene_annotation <- read.table("workshop_practice.annotation",head=T, stringsAsFactors=FALSE) 
 +  Gene_annotation <- Gene_annotation[,c(2,13)]  ## get information we need: col_2 and col_3 
 +  #match Gene_name with Gene_Symbol 
 +  Gene_name <- c() 
 +  for ( i in 1:length(Gene_Symbol) ){ 
 +  Gene_name[i]<- Gene_annotation[match(Gene_Symbol[i],Gene_annotation[,1]),2] 
 +  } 
 +   
 +  ## Start RNAseq analysis 
 +  ######################################### 
 +  ### DEG analysis Set for 1-way Analysis 
 +  ######################################### 
 +  Treat <- factor( meta_data[,3], levels = c("male","female") ) 
 +  Treat <- Treat[Samp_Index]  #dimension correction 
 +  targets <- data.frame(Treat) 
 +   
 +  #################################################### 
 +  ### Get normalized TMM values 
 +  #################################################### 
 +   
 +  design <- model.matrix(~Treat, data = targets) 
 +  colnames(design) 
 +   
 +  Y <- DGEList(counts=Expression, gene=Gene_Symbol) 
 +  Y <- estimateDisp(Y,design) 
 +  #Y <- estimateGLMCommonDisp(Y,design) 
 +  #Y <- estimateGLMTrendedDisp(Y,design) 
 +  #Y <- estimateGLMTagwiseDisp(Y,design) 
 +  fit <- glmFit(Y, design) 
 +  colnames(fit) 
 +   
 +  ## Get normalized values 
 +  TMM <- calcNormFactors(Y, method="TMM")  
 +  TMM <- cpm(TMM, normalized.lib.sizes=TRUE, log=T) 
 +   
 +  ### 1-way ANODEV based apporach for testing Stage effects 
 +  ## female vs. male 
 +  Result_anodev <- glmLRT(fit, coef=2) 
 +  Result_anodev_table <- topTags(Result_anodev, n=dim(Expression)[1], sort.by="none")$table 
 +  sum(Result_anodev_table$FDR < 0.05) #2 
 +  head(Result_anodev_table)  
 +   
 +  #to visualize the two significant genes... 
 +  head(Result_anodev_table[order(Result_anodev_table$FDR),]) #order the table by FDR value (ascending) 
 +  Gene_name[1011] 
 +  Gene_name[5473] 
 +   
 +  #quick view of histogram of p-values 
 +  hist(Result_anodev_table$PValue) 
 +  hist(Result_anodev_table$PValue, main = "P-value Distribution Male vs. Female", xlab= "P-values", ylab="Count"
 +   
 +  Final_anodev_table <- cbind (Gene_name,Result_anodev_table) #add the matching gene names to the anodev table 
 +   
 +  ### save the final table results to Result_anodev.txt 
 +  write.table( Final_anodev_table,"Result_anodev.txt", row.names=F,sep="\t"
 +   
 +  #################################################### 
 +  ###    What to do when you have NO REPLICATES?   ### 
 +  #################################################### 
 +  ### 1. Be satisfied with a descriptive analysis such as fold change. Do not attempt a significance analysis. 
 +  ### 2. Simply pick a reasonable dispersion value, based on the experience or study design. 
 +  bcv <- 0.2  #BCV is square-root-dispersion 
 +  counts <- matrix( rnbinom(40,size=1/bcv^2,mu=10),20,2 ) 
 +  y <- DGEList( counts, group=1:2 ) 
 +  y <- exactTest( y, dispersion = bcv^2 ) 
 +  y$table 
 +   
 +  ####################### EOF #######################
2.rnaseq.txt · Last modified: 2017/12/15 15:40 by hyjeong