Rscripts: Difference between revisions

From software
Jump to navigation Jump to search
 
(6 intermediate revisions by the same user not shown)
Line 9: Line 9:
##maxMis maximum allowed missing genotypes for a site
##maxMis maximum allowed missing genotypes for a site


   nMis<-rowSums(is.na(geno))
   nMis <- rowSums(is.na(geno))
   avg<-rowMeans(geno,na.rm=T)              # get allele frequency times 2
   freq <- rowMeans(geno,na.rm=T)/2               # get allele frequency  
   keep<-avg>minMaf&avg<2*(1-minMaf)& nMis<=maxMis        # remove sites with non-polymorphic data
   keep <- freq>minMaf&freq<(1-minMaf) & nMis<=maxMis        # remove sites with non-polymorphic data
   avg<-avg[keep]
   freq<-freq[keep]
   geno<-geno[keep,]
   geno<-geno[keep,]
   snp<-nrow(geno)                          #number of snps used in analysis
   snp<-nrow(geno)                          #number of snps used in analysis
   ind<-ncol(geno)                          #number of individuals used in analuysis
   ind<-ncol(geno)                          #number of individuals used in analuysis
  freq<-avg/2                              #frequency
   M <- (geno-freq*2)/sqrt(freq*(1-freq))      #normalize the genotype matrix
   M <- (geno-avg)/sqrt(freq*(1-freq))      #normalize the genotype matrix
   M[is.na(M)]<-0
   M[is.na(M)]<-0
   X<-t(M)%*%M                              #get the (almost) covariance matrix
   X<-t(M)%*%M                              #get the (almost) covariance matrix
Line 45: Line 44:
Example
Example
<pre>
<pre>
ind<-c(20,20)
ind<-c(100,100,100)
snp<-10000
snp<-10000
freq=c(0.2,0.25)
freq<-cbind(runif(snp),runif(snp),runif(snp))
geno<-c()
l<-lapply(1:length(ind),function(x) matrix(rbinom(snp*ind[x],2,freq[,x]),snp))
for(pop in 1:length(ind))
geno<-do.call(cbind,l)
  geno<-rbind(geno,matrix(rbinom(snp*ind[pop],2,freq[pop]),ind[pop]))
geno<-t(geno)
e<-eigenstrat(geno)
e<-eigenstrat(geno)
plot(e,col=rep(1:length(ind),ind),xlab="PC1",ylab="PC2")
plot(e,col=rep(1:length(ind),ind),xlab="PC1",ylab="PC2")
</pre>
</pre>
Line 63: Line 59:
<pre>
<pre>
WC84<-function(x,pop){
WC84<-function(x,pop){
x<-x[,!apply(is.na(x),2,any)]
  popNam<-sort(unique(pop))
   #number ind each population
   #number ind each population
   n<-table(pop)
   n<-table(pop)
Line 72: Line 72:
   N<-length(pop)
   N<-length(pop)
   #frequency in samples
   #frequency in samples
   p<-apply(x,2,function(x,pop){tapply(x,pop,mean)/2},pop=pop)
   p<-t(sapply(popNam,function(y) colMeans(x[pop==y,])/2))
# p<-apply(x,2,function(x,pop){tapply(x,pop,mean)/2},pop=pop)
   #average frequency in all samples (apply(x,2,mean)/2)
   #average frequency in all samples (apply(x,2,mean)/2)
   p_avg<-as.vector(n%*%p/N )
   p_avg<-as.vector(n%*%p/N )
Line 112: Line 113:
pop<-rep(1:4,40)
pop<-rep(1:4,40)
res<-WC84(x,pop)
res<-WC84(x,pop)
res$theta
res$theta_w
</pre>
</pre>



Latest revision as of 15:25, 12 March 2015

PCA/Eigensoft/Eigenstrat

eigenstrat<-function(geno,maxMis=0,minMaf=0.01){                 
## geno: snp x ind matrix of genotypes \in 0,1,2
##maxMis maximum allowed missing genotypes for a site

  nMis <- rowSums(is.na(geno))
  freq <- rowMeans(geno,na.rm=T)/2               # get allele frequency 
  keep <- freq>minMaf&freq<(1-minMaf) & nMis<=maxMis         # remove sites with non-polymorphic data
  freq<-freq[keep]
  geno<-geno[keep,]
  snp<-nrow(geno)                           #number of snps used in analysis
  ind<-ncol(geno)                           #number of individuals used in analuysis
  M <- (geno-freq*2)/sqrt(freq*(1-freq))       #normalize the genotype matrix
  M[is.na(M)]<-0
  X<-t(M)%*%M                               #get the (almost) covariance matrix
  X<-X/(sum(diag(X))/(snp-1))
  E<-eigen(X)

  mu<-(sqrt(snp-1)+sqrt(ind))^2/snp         #for testing significance (assuming no LD!)
  sigma<-(sqrt(snp-1)+sqrt(ind))/snp*(1/sqrt(snp-1)+1/sqrt(ind))^(1/3)
  E$TW<-(E$values[1]*ind/sum(E$values)-mu)/sigma
  E$mu<-mu
  E$sigma<-sigma
  E$nSNP <- nrow(geno)
  E$nInd <- ncol(geno)
  class(E)<-"eigenstrat"
  E
}



plot.eigenstrat<-function(x,col=1,...)
  plot(x$vectors[,1:2],col=col,...)

print.eigenstrat<-function(x)
  cat("statistic",x$TW,"\n")

Example

ind<-c(100,100,100)
snp<-10000
freq<-cbind(runif(snp),runif(snp),runif(snp))
l<-lapply(1:length(ind),function(x) matrix(rbinom(snp*ind[x],2,freq[,x]),snp))
geno<-do.call(cbind,l)
e<-eigenstrat(geno)
plot(e,col=rep(1:length(ind),ind),xlab="PC1",ylab="PC2")

Fst

The same as the faster "fstat" from the geneland package but this script also gives the total variance, variance within individuals, variance within population and variance between populations both on a SNP level and on as a joint estimate.

WC84<-function(x,pop){
x<-x[,!apply(is.na(x),2,any)]

  popNam<-sort(unique(pop))

  #number ind each population
  n<-table(pop)
  #number of populations
  npop<-nrow(n)
  #average sample size of each population
  n_avg<-mean(n)
  #total number of samples
  N<-length(pop)
  #frequency in samples
  p<-t(sapply(popNam,function(y) colMeans(x[pop==y,])/2))
 # p<-apply(x,2,function(x,pop){tapply(x,pop,mean)/2},pop=pop)
  #average frequency in all samples (apply(x,2,mean)/2)
  p_avg<-as.vector(n%*%p/N )
  #the sample variance of allele 1 over populations
  s2<-1/(npop-1)*(apply(p,1,function(x){((x-p_avg)^2)})%*%n)/n_avg
  #average heterozygouts
  #  h<-apply(x==1,2,function(x,pop)tapply(x,pop,mean),pop=pop)
  #average heterozygote frequency for allele 1
  #  h_avg<-as.vector(n%*%h/N)
  #faster version than above:
   h_avg<-apply(x==1,2,sum)/N
  #nc (see page 1360 in wier and cockerhamm, 1984)
  n_c<-1/(npop-1)*(N-sum(n^2)/N)
  #variance betwen populations
  a <- n_avg/n_c*(s2-(p_avg*(1-p_avg)-(npop-1)*s2/npop-h_avg/4)/(n_avg-1))
  #variance between individuals within populations
  b <- n_avg/(n_avg-1)*(p_avg*(1-p_avg)-(npop-1)*s2/npop-(2*n_avg-1)*h_avg/(4*n_avg))
  #variance within individuals
  c <- h_avg/2

  #inbreedning (F_it)
  F <- 1-c/(a+b+c)
  #(F_st)
  theta <- a/(a+b+c)
  #(F_is)
  f <- 1-c(b+c)
  #weigted average of theta
  theta_w<-sum(a)/sum(a+b+c)
  list(F=F,theta=theta,f=f,theta_w=theta_w,a=a,b=b,c=c,total=c+b+a)
}



#example of use
nsnp=10000
x<-matrix(rbinom(160*nsnp,2,0.02),160)
pop<-rep(1:4,40)
res<-WC84(x,pop)
res$theta_w

LD pruning in R

Install R package

wget http://www.popgen.dk/albrecht/misc_Rpackages/Rpakker/pruning_0.51.tar.gz
R CMD INSTALL pruning_0.51.tar.gz