#' Infer Entropy Signature
#'
#' Calculates genome-wide Shannon entropies from SNV data.
#'
#' @param polymorphisms  A data frame. Please see Details and Examples.
#' @param position Name of the \code{polymorphisms}'s column that indicates SNV
#'                 locations in the genome.
#' @param linkage Name of the column that carries information on linked
#'                positions (see Details).
#' @param ref Column name with reference bases.
#' @param alt Column name with the alternative bases observed in the
#'            metagenome.
#' @param protein Name of the column carrying protein names.
#' @param aa_position Name of the column that indicates the protein positions
#'                    of the mutated amino acids.
#' @param ref_aa Name of the column that carries the reference amino acids.
#' @param alt_aa Name of the column carrying alternative amino acids observed
#'               in the metagenome.
#' @param alt_aa_freq Name of the column giving the frequencies of alternative
#'                 amino acids in the metagenome.
#' @param categories  Whether a class per amino acid should be used
#'        ("sensitive") or they should be grouped into aliphatic, aromatic,
#'        polar, positively charged, negatively charged, and special ("robust")
#'        (Mirny and Shakhnovich, 1999).
#' @param genome A list providing CDS data and length of the reference genome.
#'
#' @return An object of class \code{entropyProfile}. It contains a tidy,
#'         summarized version of the SNV table, a \code{data frame} with
#'         information on genome-wide entropy, a \code{data frame} with
#'         information on each CDS and corresponding mutations observed in the
#'         virome, and a \code{list} with CDS data and length of the reference
#'         genome used in variant calling.
#'
#' @details You provide a data frame with SNV information including reference
#' and alternative amino acids, their frequencies, and corresponding positions
#' relative to a reference sequence.
#' This type of data can be generated by numerous programs and pipelines.
#' The objective is to assess the biological impact of nonsynonymous
#' variation within a viral population, such as an environmental sample (e.g.
#' wastewater) or a single infection (aka quasispecies).
#' Entropy is calculated \emph{within} the metagenome and is therefore independent
#' of the reference sequence.
#' Some mutations may be part of the same codon.
#' This is to be indicated in the \code{linkage} column, providing a downstream
#' linked position, or the closest upstream position if there are no downstream
#' positions that are part of the same codon.
#' For example, in the \code{wWater} dataset, mutations T22673C and C22674T are linked
#' to each other and affect codon 371 of the S gene:
#' \tabular{lccccccc}{
#' 	\tab	wave	\tab	position	\tab	linkage	\tab	ref	\tab	alt	\tab	protein	\tab ... \cr
#' ...	\tab		\tab			\tab		\tab		\tab		\tab		\tab     \cr
#' 105	\tab	third	\tab	22599		\tab	NA	\tab	G	\tab	A	\tab	S	\tab ... \cr
#' 106	\tab	third	\tab	22673		\tab	22674	\tab	T	\tab	C	\tab	S	\tab ... \cr
#' 107	\tab	third	\tab	22674		\tab	22673	\tab	C	\tab	T	\tab	S	\tab ... \cr
#' 108	\tab	third	\tab	22679		\tab	NA	\tab	T	\tab	C	\tab	S	\tab ... \cr
#' ...	\tab		\tab			\tab		\tab		\tab		\tab		\tab     \cr
#' }
#'
#' The \code{genome} parameter is a list that provides data on the topology of
#' protein-coding regions in the genome and its length, used internally
#' primarily for graphical and summary purposes.
#' The package provides an example (\code{\link{mn908947.3}}) of how this
#' information is to be organized.
#'
#' @references
#'   Mirny and Shakhnovich, 1999. J Mol Biol 291:177-196.
#'   \doi{10.1006/jmbi.1999.2911}.
#'
#'   Shannon, 1948. Bell System Technical Journal, 27:379-423.
#'   \doi{10.1002/j.1538-7305.1948.tb01338.x}.
#'
#' @examples
#' 
#' # Entropy across the genome in ancestral lineages
#' ancestral <- getEntropySignature(wWater[wWater$wave == "first", ], categories = "sensitive")
#' 
#' # Inspect profile
#' plot(ancestral, chartType = "entroScan")
#'
#'
#' @export
#
getEntropySignature <- function(polymorphisms,
				position = "position",
				linkage = "linkage",
				ref = "ref",
				alt = "alt",
				protein = "protein",
				aa_position = "aa_position",
				ref_aa = "ref_aa",
				alt_aa = "alt_aa",
				alt_aa_freq = "alt_aa_freq",
			       	categories = "robust",
				genome = mn908947.3){
	# Snapshot the arguments BEFORE any local variable exists, so that only
	# the function's formals are forwarded to entropyProfile().
	args <- as.list(environment())
	# 'categories' is only used below (fillPosition); it is not forwarded.
	args$categories <- NULL
	perfil <- do.call(entropyProfile, args)
	#
	# Pull columns out with [[ ]] so a plain vector is obtained whether
	# 'polymorphisms' is a base data.frame or a tibble.
	snvPositions <- polymorphisms[[position]]
	profilePositions <- perfil$Entropy$position
	#
	# Loop position-wise across SNVs. Missing positions are skipped: they can
	# never match a position of the profile.
	for(posicion in unique(snvPositions[!is.na(snvPositions)])){
		# which() discards NA comparisons; a bare logical index would add
		# all-NA rows to the subset whenever a position is missing.
		thisPosition <- polymorphisms[which(snvPositions == posicion), , drop = FALSE]
		hit <- profilePositions == posicion
		# Compare residues as character vectors so factor columns are matched
		# by label rather than by their internal integer codes.
		refResidues <- as.character(thisPosition[[ref_aa]])
		altResidues <- as.character(thisPosition[[alt_aa]])
		if(is.na(refResidues[1])){
			# No reference amino acid: the SNV is outside coding regions.
			perfil$Entropy$entropy[hit] <- NA
		}
		else if(identical(altResidues, refResidues)){
			# Every observed amino acid equals the reference: synonymous.
			perfil$Entropy$entropy[hit] <- 0
		}
		else{
			# At least one amino acid differs from the reference.
			positionSummary <- createPositionSummary(thisPosition, ref_aa, alt_aa, alt_aa_freq)
			perfil$Entropy$entropy[hit] <- getPosEntropy(fillPosition(positionSummary, categories))
		}
	}
	return(perfil)
}
