#################################################################################### ################################ RNA-Seq Data ###################################### #################################################################################### # Transcriptome analysis reveals dysregulation of innate immune response genes and neuronal activity-dependent genes in autism # Simone Gupta, Shannon E. Ellis, Foram N. Ashar, Anna Moes, Joel S. Bader, Jianan Zhan, Andrew B. West & Dan E. Arking # Nature Communications (Dec 2014) # any questions please email sellis18 at jhmi dot edu #################################################################################### ################################# Data Files ####################################### #################################################################################### #### Gene Expression Files #### ## Raw data: genetable # Values are read count values generated from HTSeq # Note: Outlier Samples ARE Included in this file (N=120) # See code below for processing carried out on this file # I've also included the following file: P10.70.19.GC.GeneLength -- used for normalization ## EDASeq Normalized Data w/ outliers removed : Samples104BGenes-EDASeqFull # These are the 104 RNA-Seq files analyzed in our paper # Note : These are on the log2 scale #### Phenotype File #### # samples in this file are in the same order as the phenotype file: Samples104.EDASeqFullBrainFeatures # includes covariates (ie age, sex, site), PCs/ISVs generated on these data and sequencing metrics # column "ID" has unique sample id (samples named: "brain region.sample number") #################################################################################### ################## EDASeq normalized per gene outlier removed ######################## #################################################################################### source("http://bioconductor.org/biocLite.R") biocLite("EDASeq") library(EDASeq) #v1.6 was used in the paper # read in gene expression data (N=120) geneexpression <- read.table("genetable", header = T, row.names = 1) # remove sample outliers geneexpression <- geneexpression[-c(1, 2, 5, 7, 17,21, 26, 54, 58, 83, 87, 92, 94, 95,117, 118)] #################10 samples + 5 samples + 1 (N=104) Humanfeature <- read.table("P10.70.19.GC.GeneLength", header = TRUE, row.names = 1) Humanfeature$GC <- Humanfeature$GC/100 feData<- new("AnnotatedDataFrame", data=data.frame(Humanfeature)) data <- newSeqExpressionSet(counts=as.matrix(geneexpression),featureData=feData) dataOffsetGC <- withinLaneNormalization(data,"GC", which="full",offset=FALSE) dataOffset <- betweenLaneNormalization(dataOffsetGC, which="full",offset=FALSE) Normdata<- counts(dataOffset) Normdata <- log2(Normdata+1) #############Normalized log2 # Genes for which we have read count > 10 (log expression across all samples # remove NAs genes <- t(Normdata) getF <- rep('NULL', dim(genes)[2]) for(i in 1:dim(genes)[2]){ getF[i] <- (length(which(genes[,i ] > log2(10))) == dim(genes)[1])} j <- genes[,which(getF == 'TRUE')] # remove per-gene outliers for(i in 1:dim(genes)[2]){ genes[,i][(abs(genes[,i] - mean(genes[,i], na.rm = T)) > 3 *sd(genes[,i], na.rm = T))] <- NA } # generating PCs pcaSRN <- prcomp(t(j), center=TRUE, scale.=TRUE) # metadata is the phenotype file from above (Samples104.EDASeqBrainFeatures.txt) # generating ISVs metadata <- read.table("Samples104.EDASeqFullBrainFeatures", header = T, row.names = 1) library(isva) pheno <- metadata$Dx.01 a<- isvaFn(t(j), pheno, ncomp = NULL) b <- as.data.frame(a$isv) for(i in 1:11){colnames(b)[i] <- paste("ISV", i, sep="")}