# SUMMARY
# -------
# This file defines some functions that I use for QTL mapping in
# GEMMA. Here is an overview of the functions defined in this file:
#
#	prepare.pheno(pheno)
#	remove.outliers(pheno,phenotype,covariates,outliers,verbose)
#	read.pheno.pl(file)
#	computemaf(geno)
#	none.missing.row(x)
#	center.columns(X)
#	repmat(A,m,n)

# FUNCTION DEFINITIONS
# ----------------------------------------------------------------------
# This function prepares the phenotype data for QTL mapping.
prepare.pheno <- function (pheno) {

# Remove rows marked as "discard" and as possible sample mixups.
  pheno <- subset(pheno,discard == "no" & mixup == "no")
  
#Scale Centroid size skull to mean zero and std 1.
  pheno <- transform(pheno,sCS = (sCS - mean(sCS, na.rm=TRUE))/sqrt(var(sCS, na.rm=TRUE)))

#Scale Centroid size mandible new phenotypes to mean zero and std 1.  
  pheno <- transform(pheno,mCS = (mCS - mean(mCS, na.rm=TRUE))/sqrt(var(mCS, na.rm=TRUE)))

  # Return the updated phenotype data table.
  return(pheno)
  }

 # ----------------------------------------------------------------------
# Remove outliers from the phenotype data for the given phenotype,
# optionally conditioning on covariates. If covariates are specified,
# outliers are determined according to the residual of the phenotype
# after regressing on the covariates.
remove.outliers <- function (pheno, phenotype, covariates, outliers,
                             verbose = TRUE) {

  # Specify the function for removing the outliers.
  is.outlier <- function (x) {
    y           <- outliers(x)
    y[is.na(y)] <- FALSE
    return(y)
  }
  
  # If we are conditioning on one or more covariates, get the
  # residuals of the phenotype conditioned on these covariates.
  if (length(covariates) > 0) {
    f <- formula(paste(phenotype,"~",paste(covariates,collapse="+")))
    r <- resid(lm(f,pheno,na.action = na.exclude))
  } else
    r <- pheno[[phenotype]]

  # If requested, report how many outlying data points are removed.
  if (verbose) {
    n <- sum(is.outlier(r))
    if (n == 0)
      cat("No outliers for",phenotype)
    else
      cat(n,"outliers for",phenotype)
    if (length(covariates) == 0)
      cat(" are removed.\n")
    else
      cat(" conditioned on",paste(covariates,collapse=" + "),"are removed.\n")
  }
    
  # Remove the outlying data points.
  pheno[is.outlier(r),phenotype] <- NA

  # Return the updated phenotype data table.
  return(pheno)
}


#------------------------------------------------------------------------
# Returns a data frame containing the Ploen phenotype stored in a CSV file

read.pheno.pl <- function (file) {
  pheno <- read.csv(file,header = TRUE,quote = "",check.names = FALSE,
                    stringsAsFactors = FALSE,skip = 5)

# Convert several columns to factors.
  pheno <- transform(pheno,
                     id            = as.character(id),
                     #round         = factor(round,paste0("SW",1:25)),
                     discard       = factor(discard),
                     mixup         = factor(mixup))
					 
					 
  return(pheno)
}

# ----------------------------------------------------------------------
# Returns the minor allele frequency given a vector of genotypes
# encoded as allele counts.
computemaf <- function (geno) {
  f <- mean(geno,na.rm = TRUE)/2
  return(min(f,1-f))
}

# ----------------------------------------------------------------------
# For each row of the matrix or data frame, returns true if all the
# entries in the row are provided (not missing).
none.missing.row <- function (x)
    rowSums(is.na(x)) == 0
	
# ----------------------------------------------------------------------
# Centers the columns of matrix X so that the entries in each column
# of X add up to zero.
center.columns <- function (X) {
  mu <- matrix(colMeans(X),1,ncol(X))
  X  <- X - repmat(mu,nrow(X),1)
  return(X)
}


# ----------------------------------------------------------------------
# Does the same thing as repmat(A,m,n) in MATLAB.
repmat <- function (A,m,n) {
  return(kronecker(matrix(1,m,n),A))
}