require(multtest) # Required for FDR calculations # Main function # The datafile contains one line for each homology group # The columns are then defined as follows # Column 1: Group id # Column 2: Minimum paralog p-value for species 1 # Column 3: Number of homologs in the group for species 1 # Column 4: Number of homologs measured for species 1 # Column 5: Minimum paralog p-value species 2 # Column 6: Number of homologs in the group for species 2 # Column 7: Number of homologs measured for species 2 # # Example data files are available at # http://bioinformatics.math.chalmers.se/Xspecies/ # # Example: # xspecies(file="HS-8sp-twotails.txt") xspecies<-function(file="HS-8sp-twotails.txt", weights=rep(1, length(species.names)), species.groups=NULL){ # Read datafile data<-read.table(file, header=T, sep="\t") # One homology group on each line nGroups<-dim(data)[1] # Each species have three columns rem<-(dim(data)[2]-1) %% 3 if(rem!=0){ stop("Wrong number of columns!") } # Calcultes the number of species nSpecies<-(dim(data)[2]-1)/3 # Calculate the columns for p-values, number of homologenes and the # number of genes represented for each species pv.seq<-seq(1+1, 3*nSpecies+1, by=3) nobs.seq<-seq(1+2, 3*nSpecies+1, by=3) ngenes.seq<-seq(1+3, 3*nSpecies+1, by=3) # Extract the species names species.names<-unlist(strsplit(colnames(data)[pv.seq], "\\."))[seq(1,2*nSpecies, by=2)] cat(sprintf("Found the following species:%s\n", paste(species.names, collapse=", "))) # Init some vectors and matrices P.combined<-numeric(length=nGroups) S<-numeric(length=nGroups) alpha<-numeric(length=nGroups) nSp.avail<-numeric(length=nGroups) Y.matrix<-matrix(NA, nr=nGroups, nc=nSpecies) colnames(Y.matrix)<-paste("NormScore", species.names, sep="-") P.all<-matrix(nr=nGroups, nc=nSpecies) colnames(P.all)<-species.names # Loop over all homology groups for(i in 1:nGroups){ # Extract species-specific minium p-values Pvalues<-as.numeric(data[i,pv.seq]) # Save P-values P.all[i,]<-as.numeric(Pvalues) # What p-values are available (i.e. not NA) tmp<-!is.na(Pvalues) # Recalculate weights based on available species and species rgoups if(sum(tmp)>0){ # Do we have data for at least one species # Number of available species nSp.avail[i]<-sum(!is.na(Pvalues)) # Has the species group paramter been supplied if(!is.null(species.groups)){ # Extract the different species groups sp.groups<-unique(species.groups) # Extarct the number of species groups nsp.groups<-length(sp.groups) # Loop over the speices groups for(j in 1:nsp.groups){ tmp.sp<-species.groups==sp.groups[j] if(sum(tmp.sp)>1){ # Species groups with more than one # species may need to be adjusted # depending on their availability tmp.avail<-(tmp&tmp.sp) # Number of species available in # the species group if(sum(tmp.avail)>0){ # Do we have any species available? # Adjust weights to the available species without # changing the weight of the group weights.group[tmp.avail]<-weights.group[tmp.avail]/sum(weights.group[tmp.avail])*sum(weights.group[tmp.sp]) } } } } else { weights.group<-weights } # Recalculate weights based on species available for the # homology group weights.avail<-weights.group[tmp]/sum(weights.group[tmp]) # Equal weight # Calculate the number of paralogs for each species nparalogs<-data[i,nobs.seq][tmp] # Pick out the corresponding number of paralogs # Calculate Y (-log min P-values) Y<-(-log(Pvalues))[tmp] # Calculate the expected value ExpectedValue<-sapply(nparalogs, CalculateExpectedValue) # Calculate the expected value # Scale Y Y.scaled<-Y/ExpectedValue # Y/Exp[Y] # Save Y for output Y.matrix[i,tmp]<-as.numeric(Y.scaled) # Calculate S as the weighted sum of the scaled Ys S[i]<-sum(Y.scaled*weights.avail, na.rm=T) # Parameter for the approximate gamma distribution alpha[i]<-CalculateAlpha(nparalogs, weights.avail) # Calculate the cross-species specific p-value P.combined[i]<-pgamma(S[i], shape=alpha[i], rate=alpha[i], lower.tail=F) } else { # No observation for this homology group S[i]<-NA nSp.avail[i]<-0 alpha[i]<-NA P.combined[i]<-NA } } # Calculate FDR based on the multtest package tmp<-mt.rawp2adjp(P.combined, proc="BH") # FDR correction FDR.combined<-tmp$adjp[,"BH"][order(tmp$index)] # Combine everything for the output Result<-data.frame(Group=data[,1], nSpecies=nSp.avail, S=S, alpha=alpha, Variance=1/alpha, Pvalue=P.combined, FDR=FDR.combined, P.all, Y.matrix, data[,nobs.seq], data[,ngenes.seq]) return(Result) } CalculateExpectedValue<-function(n){ n<-as.numeric(n) if(is.na(n)){ return(NA) } if(n>0){ return(sum(1/(1:n))) } else { return(NA) } } CalculateVariance<-function(n){ if(is.na(n)){ return(NA) } if(n>0){ return(sum(1/(1:n)^2)) } else { return(NA) } } CalculateAlpha<-function(nparalogs, weights){ nspecies<-sum(nparalogs>0) ExpectedValues<-sapply(nparalogs, CalculateExpectedValue) Variances<-sapply(nparalogs, CalculateVariance) alpha<-1/(sum(weights^2*(1/(ExpectedValues^2))*Variances, na.rm=T)) return(alpha) } # Method for directed test. Uses two files, one for each tail. xspecies.directed<-function(file.up="HS-8sp-onetail-up.txt", file.down="HS-8sp-onetail-down.txt", sort=FALSE, ...){ res.up<-xspecies(file.up, ...) res.down<-xspecies(file.down, ...) p.up<-res.up[,"Pvalue"] p.down<-res.down[,"Pvalue"] # Picks the most significant p-value tmp<-p.down