Added reference argument to filters, updated docs.

Aaron Lun · Aaron Lun · commit c70cfd68b543 · 2015-06-27T04:02:54.000Z
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/diffHic@105563 bc3139a8-67e5-0310-9ffc-ced21a209358
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: diffHic
-Version: 1.1.8
-Date: 2015/06/21
+Version: 1.1.9
+Date: 2015/06/27
 Title: Differential analyis of Hi-C data
 Author: Aaron Lun <alun@wehi.edu.au>
 Maintainer: Aaron Lun <alun@wehi.edu.au>
diff --git a/R/filters.R b/R/filters.R
@@ -1,27 +1,54 @@
-filterDirect <- function(data, ...)
+filterDirect <- function(data, prior.count=2, reference=NULL)
 # Implements the direct filtering method on the abundances of 
-# inter-chromosomal bin pairs.
+# inter-chromosomal bin pairs. Also allows for specification of
+# a reference set of bin pairs (usually larger bins from which
+# the abundances can be more stably computed).
 #
 # written by Aaron Lun
 # created 5 March 2015
-# last modified 20 March 2015
+# last modified 24 June 2015
 {
+	if (!is.null(reference)) { 
+		actual.ab <- scaledAverage(asDGEList(data), prior.count=prior.count)
+		ref <- Recall(reference, prior.count=prior.count)
+
+		stopifnot(identical(reference$totals, data$totals))
+		scaling <- (.getBinSize(reference)/.getBinSize(data))^2
+		adj.thresh <- .repriorAveLogCPM(ref$threshold, totals=data$totals,
+			prior.count=prior.count, scaling=scaling)
+		return(list(abundances=actual.ab, threshold=adj.thresh, ref=ref))
+	}
+
 	all.chrs <- seqnames(regions(data))
 	is.inter <- as.logical(all.chrs[anchors(data, id=TRUE)]!=all.chrs[targets(data, id=TRUE)])
-	ave.ab <- scaledAverage(asDGEList(data), ...)
+	ave.ab <- scaledAverage(asDGEList(data), prior.count=prior.count)
 
-	threshold <- .getInterThreshold(all.chrs, ave.ab[is.inter], empty=.makeEmpty(data, ...))
+	threshold <- .getInterThreshold(all.chrs, ave.ab[is.inter],
+		empty=.makeEmpty(data, prior.count=prior.count))
 	return(list(abundances=ave.ab, threshold=threshold))
 }
 
-.getInterThreshold <- function(all.chrs, inter.ab, empty=NA) { 
-	# Getting the total number of inter-chromosomal bins.
+.getBinSize <- function(data) 
+# Gets the bin size in base pairs: the 'width' stored in exptData(data) if
+# present, otherwise the median width of the regions. The fallback covers
+# more exotic set-ups, e.g., Capture-C loaded with regionCounts where anchors
+# are evenly sized (but irregularly spaced) bins around probes.
+{
+	out <- exptData(data)$width
+	if (is.null(out)) { out <- median(width(regions(data))) }
+	return(out) 
+}
+
+.getInterThreshold <- function(all.chrs, inter.ab, empty=NA) 
+# Computes the threshold from inter-chromosomal interactions.
+# First we get the total number of inter-chromosomal bins,
+# and then we compute the median (accounting for those lost).
+{ 
 	n.bins <- as.numeric(runLength(all.chrs))
 	total.bins <- sum(n.bins)
 	n.inter <- total.bins * (total.bins + 1L)/2L - sum(n.bins * (n.bins + 1L)/2L)
 	prop.kept <- length(inter.ab)/n.inter
 
-	# Getting the threshold.
 	if (prop.kept >= 1) { 
 		threshold <- median(inter.ab) 
 	} else if (prop.kept < 0.5) { 
@@ -35,20 +62,43 @@ filterDirect <- function(data, ...)
 
 .makeEmpty <- function(data, ...) { scaledAverage(DGEList(rbind(integer(ncol(data))), lib.size=data$totals), ...) }
 
-filterTrended <- function(data, span=0.25, ...)
+.repriorAveLogCPM <- function(AveLogCPM, totals, prior.count, scaling)
+# Adjusting the average log-CPM to use a new prior count; callers in this file pass 'scaling' as the squared ratio of reference to data bin size.
+{
+	ave.count <- 2^AveLogCPM * mean(totals) / 1e6 # back-transform from log2-CPM to a count, using the mean of 'totals'
+	ave.count <- ave.count + prior.count*(scaling - 1) # add a further (scaling-1) multiples of the prior count
+	return(log2(ave.count * 1e6 / mean(totals) / scaling)) # divide by 'scaling' and return to the log2-CPM scale
+}
+
+filterTrended <- function(data, span=0.25, prior.count=2, reference=NULL)
 # Implements the trended filtering method on the abundances of 
-# inter-chromosomal bin pairs. 
+# inter-chromosomal bin pairs. Again, with allowances for a reference set.
 #
 # written by Aaron Lun
 # created 5 March 2015
-# last modified 20 March 2015
+# last modified 24 June 2015
 {
+	if (!is.null(reference)) {
+		actual.ab <- scaledAverage(asDGEList(data), prior.count=prior.count)
+		actual.dist <- log10(getDistance(data, type="mid") + .getBinSize(data))
+		ref <- Recall(reference, span=span, prior.count=prior.count)
+		
+		new.threshold <- approx(x=ref$log.distance, y=ref$threshold, xout=actual.dist, rule=2)$y
+		new.threshold[is.na(actual.dist)] <- ref$threshold[is.na(ref$log.distance)][1] # Direct threshold.
+
+		stopifnot(identical(reference$totals, data$totals))
+		scaling <- (.getBinSize(reference)/.getBinSize(data))^2
+		adj.thresh <- .repriorAveLogCPM(new.threshold, totals=data$totals,
+			prior.count=prior.count, scaling=scaling)
+		return(list(abundances=actual.ab, threshold=adj.thresh, log.distance=actual.dist, ref=ref)) 
+	}
+
 	dist <- getDistance(data, type="mid")
-	log.dist <- log10(dist + exptData(data)$width)
-	ave.ab <- scaledAverage(asDGEList(data), ...)
+	log.dist <- log10(dist + .getBinSize(data))
+	ave.ab <- scaledAverage(asDGEList(data), prior.count=prior.count)
 
 	# Filling in the missing parts of the interaction space.
-	empty <- .makeEmpty(data, ...)
+	empty <- .makeEmpty(data, prior.count=prior.count)
 	is.intra <- !is.na(log.dist)
 	n.intras <- sum(is.intra)
 	all.chrs <- seqnames(regions(data))
@@ -66,7 +116,7 @@ filterTrended <- function(data, span=0.25, ...)
 		extra.dist <- .Call(cxx_get_missing_dist, cumsum(runLength(all.chrs)),
 			a.pts-1L, t.pts-1L, (start(regions(data))+end(regions(data)))/2)
 		if (is.character(extra.dist)) { stop(extra.dist) }
-		extra.dist <- log10(extra.dist + exptData(data)$width)
+		extra.dist <- log10(extra.dist + .getBinSize(data))
 		trend.threshold <- loessFit(x=c(log.dist, extra.dist), 
 			y=c(ave.ab, rep(empty, length(extra.dist))), 
 			span=span)$fitted[1:length(log.dist)]
diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd
@@ -2,7 +2,7 @@
 \title{diffHic News}
 \encoding{UTF-8}
 
-\section{Version 1.1.7}{\itemize{
+\section{Version 1.1.9}{\itemize{
 \item
 Added library size specification to DIList methods normalize(), asDGEList().
 
@@ -42,6 +42,9 @@ Switched to reporting ranges directly from boxPairs(), added support for minimum
 \item
 Modified consolidatePairs() to accept index vectors for greater modularity.
 
+\item 
+Added reference argument for large bin pairs, in filterDirect() and filterTrended().
+
 \item 
 Updated documentation, tests and user's guide.
 }}
diff --git a/man/DNaseHiC.Rd b/man/DNaseHiC.Rd
@@ -37,6 +37,7 @@ Also, invalidity of chimeras is determined by checking whether the 3' end is mor
 The size of the pseudo-fragments is determined by, well, \code{size} in \code{segmentGenome}.
 Smaller sizes provide better resolution but increase computational work.
 Needless to say, the \code{param$fragments} field should contain the output from \code{segmentGenome}, rather than from \code{\link{cutGenome}}.
+Also see \code{\link{cutGenome}} documentation for a warning about the chromosome names.
 
 Some loss of spatial resolution is inevitable when reads are summarized into pseudo-fragments.
 This is largely irrelevant, though, as counting across the interaction space will ultimately use much larger bins (usually at least 2 kbp).
diff --git a/man/cutGenome.Rd b/man/cutGenome.Rd
@@ -33,6 +33,13 @@ Nonetheless, they are still reported to maintain the correspondence between frag
 Cleavage sites on the forward strand can be obtained as the \code{start} locations of all fragments (excepting the first fragment on each chromosome).
 }
 
+\section{Warning}{
+If \code{bs} is a FASTA file, the chromosome names in the FASTA headers will be loaded faithfully by \code{cutGenome}.
+However, many mapping pipelines will drop the rest of the name past the first whitespace when constructing the alignment index.
+To be safe, users should ensure that the chromosome names in the FASTA headers consist of one word.
+Otherwise, there will be a discrepancy between the chromosome names in the output \code{GRanges}, and those in the BAM files after alignment.
+}
+
 % Interpretations of consecutive sites is generally tricky.
 % For starters, the 'remainder' is so low that the strands are unlikely to stay stuck together until the fill-in step.
 % This becomes an impossibility if remainder is zero, such that ssDNA is formed after cleavage of consecutive sites.
diff --git a/man/filters.Rd b/man/filters.Rd
@@ -6,14 +6,15 @@
 \description{Implementations of the direct and trended filtering strategies for bin pair abundances.}
 
 \usage{
-filterDirect(data, ...)
-filterTrended(data, span=0.25, ...)
+filterDirect(data, prior.count=2, reference=NULL)
+filterTrended(data, span=0.25, prior.count=2, reference=NULL)
 }
 
 \arguments{
 \item{data}{a \code{DIList} object produced by \code{\link{squareCounts}}}
 \item{span}{a numeric scalar specifying the bandwidth for loess curve fitting}
-\item{...}{other arguments to be passed to \code{\link{scaledAverage}}}
+\item{prior.count}{a numeric scalar indicating the prior count to use for calculating the average abundance}
+\item{reference}{another \code{DIList} object, usually containing data for larger bin pairs}
 }
 
 \details{
@@ -32,14 +33,20 @@ Lower values may need to be used for a more accurate fit when the trend is highl
 The bin size is also added to the distance prior to log-transformation, to avoid problems with undefined values when distances are equal to zero.
 Empty parts of the interaction space are considered by inferring the abundances and distances of the corresponding bin pairs (though this is skipped if too much of the space is empty).
 
-The \code{scale} argument can be passed to \code{\link{scaledAverage}}, in order to handle comparisons between bin pairs of different sizes.
-Check out the user's guide for more details.
+If \code{reference} is specified, it will be used to compute filter thresholds instead of \code{data}.
+This is intended for large bin pairs that have been loaded with \code{filter=1}.
+Larger bins provide larger counts for more precise threshold estimates, while the lack of filtering ensures that estimates are not biased.
+All threshold estimates are adjusted to account for differences in bin sizes between \code{reference} and \code{data}.
+The final values can be used to directly filter on abundances in \code{data}; check out the user's guide for more details.
 }
 
 \value{
 A list is returned containing \code{abundances}, a numeric vector with the average abundances of all bin pairs in \code{data}.
 For \code{filterDirect}, the list contains a numeric scalar \code{threshold}, i.e., the non-specific ligation rate.
-For \code{filterTrended}, the list contains \code{threshold}, a numeric vector containing the threshold for each bin pair; and \code{log.distances}, a numeric vector with the log-distances for each bin pair.
+For \code{filterTrended}, the list contains \code{threshold}, a numeric vector containing the threshold for each bin pair; and \code{log.distance}, a numeric vector with the log-distances for each bin pair.
+
+If \code{reference} is specified in either function, an additional list named \code{ref} is also returned.
+This contains the filtering information for the bin pairs in \code{reference}, same as that reported above for each bin pair in \code{data}.
 }
 
 \seealso{
@@ -73,6 +80,24 @@ y[keep,]
 trended <- filterTrended(y)
 keep <- trended$abundances > trended$threshold 
 y[keep,]
+
+# Running reference comparisons, using larger bin pairs.
+w <- 5L
+a2 <- a/w
+b2 <- b/w
+regions2 <- GRanges(rep(c("chrA", "chrB"), c(a2, b2)), 
+    IRanges(c(1:a2, 1:b2)*w-w+1L, c(1:a2, 1:b2)*w))
+npairs2 <- 20
+all.anchors2 <- sample(length(regions2), npairs2, replace=TRUE)
+all.targets2 <- as.integer(runif(npairs2, 1, all.anchors2+1))
+y2 <- DIList(matrix(rnbinom(npairs2*4, mu=10*w^2, size=10), npairs2, 4), 
+    anchors=all.anchors2, targets=all.targets2, regions=regions2, 
+    totals=y$totals, exptData=List(width=w))
+
+direct2 <- filterDirect(y, reference=y2)
+sum(direct2$abundances > direct2$threshold + log2(1.5))
+trended2 <- filterTrended(y, reference=y2)
+sum(trended2$abundances > trended2$threshold)
 }
 
 \references{