####################################################################################
################################ RNA-Seq Data ######################################
####################################################################################
# Transcriptome analysis reveals dysregulation of innate immune response genes and neuronal activity-dependent genes in autism
# Simone Gupta, Shannon E. Ellis, Foram N. Ashar, Anna Moes, Joel S. Bader, Jianan Zhan, Andrew B. West & Dan E. Arking
# Nature Communications (Dec 2014)
# any questions please email sellis18 at jhmi dot edu 

####################################################################################
################################# Data Files #######################################
####################################################################################

#### Gene Expression Files ####
	## Raw data: genetable
		# Values are read count values generated from HTSeq
		# Note: Outlier Samples ARE Included in this file (N=120)
		# See code below for processing carried out on this file
		# I've also included the following file: P10.70.19.GC.GeneLength -- used for normalization 
		
	## EDASeq Normalized Data w/ outliers removed : Samples104BGenes-EDASeqFull
		# These are the 104 RNA-Seq files analyzed in our paper
		# Note : These are on the log2 scale

#### Phenotype File ####
	# samples in this file are in the same order as the phenotype file: Samples104.EDASeqFullBrainFeatures
	# includes covariates (ie age, sex, site), PCs/ISVs generated on these data and sequencing metrics
	# column "ID" has unique sample id (samples named: "brain region.sample number")

####################################################################################
################## EDASeq normalized per gene outlier removed  ########################
####################################################################################
source("http://bioconductor.org/biocLite.R")
biocLite("EDASeq")
library(EDASeq)		#v1.6 was used in the paper

# read in gene expression data (N=120)
geneexpression <- read.table("genetable", header = T, row.names = 1)

# remove sample outliers
geneexpression <- geneexpression[-c(1, 2, 5, 7, 17,21,  26, 54, 58, 83, 87, 92, 94, 95,117, 118)] #################10 samples + 5 samples + 1 (N=104)
Humanfeature <- read.table("P10.70.19.GC.GeneLength", header = TRUE, row.names = 1)
Humanfeature$GC <- Humanfeature$GC/100
feData<- new("AnnotatedDataFrame", data=data.frame(Humanfeature))
data <- newSeqExpressionSet(counts=as.matrix(geneexpression),featureData=feData)
dataOffsetGC <- withinLaneNormalization(data,"GC", which="full",offset=FALSE)
dataOffset <- betweenLaneNormalization(dataOffsetGC, which="full",offset=FALSE)
Normdata<-  counts(dataOffset)
Normdata <- log2(Normdata+1) #############Normalized log2

# Genes for which we have read count > 10 (log expression across all samples
# remove NAs
genes <- t(Normdata)
getF <- rep('NULL', dim(genes)[2])
for(i in 1:dim(genes)[2]){
getF[i] <- (length(which(genes[,i ] > log2(10))) == dim(genes)[1])}
j  <-  genes[,which(getF == 'TRUE')]

# remove per-gene outliers
for(i in 1:dim(genes)[2]){
genes[,i][(abs(genes[,i] - mean(genes[,i], na.rm = T)) > 3 *sd(genes[,i], na.rm = T))] <- NA
}

# generating PCs
pcaSRN <- prcomp(t(j), center=TRUE, scale.=TRUE)

# metadata is the phenotype file from above (Samples104.EDASeqBrainFeatures.txt)
# generating ISVs
metadata <- read.table("Samples104.EDASeqFullBrainFeatures", header = T, row.names = 1)

library(isva)
pheno <- metadata$Dx.01
a<- isvaFn(t(j), pheno, ncomp = NULL)
b <- as.data.frame(a$isv)
for(i in 1:11){colnames(b)[i] <- paste("ISV", i, sep="")}