Commit d007ca8

Aaron Lun authored and committed
Updated correctedContact, fixed filters.
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/diffHic@106535 bc3139a8-67e5-0310-9ffc-ced21a209358
1 parent 448c97d commit d007ca8

9 files changed, +120 -60 lines changed

DESCRIPTION

+2 -2

@@ -1,6 +1,6 @@
 Package: diffHic
-Version: 1.1.10
-Date: 2015/07/13
+Version: 1.1.11
+Date: 2015/07/18
 Title: Differential analyis of Hi-C data
 Author: Aaron Lun <alun@wehi.edu.au>
 Maintainer: Aaron Lun <alun@wehi.edu.au>

R/correctedContact.R

+29 -13

@@ -1,4 +1,5 @@
-correctedContact <- function(data, iterations=50, exclude.local=1, ignore.low=0.02, winsor.high=0.02, average=TRUE, dispersion=0.05)
+correctedContact <- function(data, iterations=50, exclude.local=1, ignore.low=0.02, winsor.high=0.02,
+        average=TRUE, dist.correct=FALSE)
 # This performs the iterative correction method of Mirny et al. (2012) to
 # identify the true contact probability of each patch of the interaction
 # space. The idea is to use the true contact probability as a filter
@@ -9,20 +10,24 @@ correctedContact <- function(data, iterations=50, exclude.local=1, ignore.low=0.
 #
 # written by Aaron Lun
 # some time ago
-# last modified 20 March 2015
+# last modified 18 July 2015
 {
     if (!average & ncol(data)>1L) {
-        collected.truth <- collected.bias <- collected.max <- list()
+        collected.truth <- collected.bias <- collected.max <- collected.trend <- list()
         for (lib in 1:ncol(data)) {
             out <- Recall(data[,lib], iterations=iterations, exclude.local=exclude.local, ignore.low=ignore.low,
-                winsor.high=winsor.high, average=FALSE, dispersion=dispersion)
+                winsor.high=winsor.high, average=FALSE, dist.correct=dist.correct)
             collected.truth[[lib]] <- out$truth
             collected.bias[[lib]] <- out$bias
             collected.max[[lib]] <- out$max
+            collected.trend[[lib]] <- out$trend
         }
-        return(list(truth=do.call(cbind, collected.truth),
+
+        output <- list(truth=do.call(cbind, collected.truth),
             bias=do.call(cbind, collected.bias),
-            max=do.call(cbind, collected.max)))
+            max=do.call(cbind, collected.max))
+        if (dist.correct) { output$trend <- do.call(cbind, collected.trend) }
+        return(output)
     }
 
     # Checking arguments.
@@ -34,24 +39,35 @@ correctedContact <- function(data, iterations=50, exclude.local=1, ignore.low=0.
     if (winsor.high >= 1) { stop("proportion of high coverage interactions to winsorize should be less than 1") }
     exclude.local <- as.integer(exclude.local)
 
-    is.local <- !is.na(getDistance(data))
-    log.lib <- log(data$totals)
-    if (length(log.lib)>1L) {
-        ave.counts <- exp(edgeR::mglmOneGroup(counts(data), offset=log.lib - mean(log.lib), dispersion=dispersion))
+    # Computing average counts, with or without distance correction.
+    if (dist.correct) {
+        temp <- data
+        temp$totals <- 1e6 # need library size to be reflected in fitted value of trend.
+        trended <- filterTrended(temp, prior.count=0)
+        ave.counts <- 2^(trended$abundance - trended$threshold)
+        is.local <- !is.na(trended$log.distance)
         nzero <- !is.na(ave.counts)
     } else {
-        nzero <- counts(data) != 0L
-        ave.counts <- as.double(counts(data))
+        is.local <- !is.na(getDistance(data))
+        log.lib <- log(data$totals)
+        if (length(log.lib)>1L) {
+            ave.counts <- exp(edgeR::mglmOneGroup(counts(data), offset=log.lib - mean(log.lib)))
+            nzero <- !is.na(ave.counts)
+        } else {
+            nzero <- counts(data) != 0L
+            ave.counts <- as.double(counts(data))
+        }
     }
 
     out<-.Call(cxx_iterative_correction, ave.counts[nzero], anchors(data, id=TRUE)[nzero], targets(data, id=TRUE)[nzero],
         is.local[nzero], length(regions(data)), iterations, exclude.local, ignore.low, winsor.high)
     if (is.character(out)) { stop(out) }
-    full.truth <- rep(NA, length(nzero))
+    full.truth <- rep(0, length(nzero))
     full.truth[nzero] <- out[[1]]
     out[[1]] <- full.truth
 
     names(out) <- c("truth", "bias", "max")
+    if (dist.correct) { out$trend <- trended$threshold }
    return(out)
 }

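Note on the new dist.correct branch above: rather than feeding raw average counts into the correction, it takes log2 abundances from filterTrended() and back-transforms the residual from the fitted distance trend onto the count scale, so that short-range bin pairs are not uniformly favoured. A minimal sketch of that conversion, using made-up log2 vectors in place of the filterTrended() output:

# 'abundance' and 'trend' stand in for trended$abundance and trended$threshold,
# both on the log2 scale as returned by filterTrended().
abundance <- c(5.2, 3.1, 4.8, 2.0)  # log2 average abundance per bin pair
trend     <- c(4.0, 4.0, 2.5, 2.5)  # log2 fitted trend at each pair's distance

ave.counts <- 2^(abundance - trend) # residual, back-transformed to the count scale
ave.counts
# Pairs above the trend give values > 1, pairs below it give values < 1;
# these play the role of 'ave.counts' in the iterative correction above.

Setting temp$totals to a constant (1e6) pins the library size so that, per the in-line comment, it is reflected consistently in the fitted value of the trend.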
R/filters.R

+6 -3

@@ -35,7 +35,7 @@ filterDirect <- function(data, prior.count=2, reference=NULL)
 # sized so treatable as bin pairs, but irregularly spaced).
 {
     out <- exptData(data)$width
-    if (is.null(out)) { out <- median(regions(data)) }
+    if (is.null(out)) { out <- median(width(regions(data))) }
     return(out)
 }
 
@@ -123,8 +123,11 @@ filterTrended <- function(data, span=0.25, prior.count=2, reference=NULL)
     }
 
     # Using the direct threshold.
-    direct.threshold <- .getInterThreshold(seqnames(regions(data)), ave.ab[is.na(log.dist)], empty=empty)
-    trend.threshold[is.na(log.dist)] <- direct.threshold
+    is.inter <- is.na(log.dist)
+    if (any(is.inter)) {
+        direct.threshold <- .getInterThreshold(seqnames(regions(data)), ave.ab[is.inter], empty=empty)
+        trend.threshold[is.inter] <- direct.threshold
+    }
     return(list(abundances=ave.ab, threshold=trend.threshold, log.distance=log.dist))
 }

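Two small fixes here: the old code applied median() to the GRanges returned by regions(data), when the intended fallback was the median width of those regions; and .getInterThreshold() is now only invoked when inter-chromosomal bin pairs actually exist, avoiding an empty subset when all pairs are intra-chromosomal. A quick illustration of the first fix, with a toy GRanges standing in for regions(data):

# Toy GRanges with three regions of width 100, 200 and 400.
suppressPackageStartupMessages(library(GenomicRanges))
gr <- GRanges("chr1", IRanges(start=c(1, 101, 301), end=c(100, 300, 700)))

median(width(gr))  # 200, the intended fallback bin width for filterDirect()
# median() on the GRanges object itself is not meaningful, hence the correction.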
inst/NEWS.Rd

+4 -2

@@ -2,7 +2,7 @@
 \title{diffHic News}
 \encoding{UTF-8}
 
-\section{Version 1.1.10}{\itemize{
+\section{Version 1.1.11}{\itemize{
 \item
 Added library size specification to DIList methods normalize(), asDGEList().
 
@@ -33,6 +33,9 @@ Added compartmentalize() function to identify genomic compartments.
 \item
 Added domainDirections() function to help identify domains.
 
+\item
+Modified correctedContact() to allow distance correction and report factorized probabilities directly.
+
 \item
 Modified marginCounts() function for proper single-end-like treatment of Hi-C data.
 
@@ -48,7 +51,6 @@ Modified consolidatePairs() to accept index vectors for greater modularity.
 \item
 Added reference argument for large bin pairs, in filterDirect() and filterTrended().
 
-
 \item
 Updated documentation, tests and user's guide.
 }}

inst/tests/test-itercor.R

+8 -5

@@ -4,7 +4,7 @@
 suppressWarnings(suppressPackageStartupMessages(require(diffHic)))
 suppressPackageStartupMessages(require(edgeR))
 
-comp<- function(npairs, nfrags, nlibs, lambda=5, dispersion=0.05, winsorize=0.02, discard=0.02, locality=1) {
+comp<- function(npairs, nfrags, nlibs, lambda=5, winsorize=0.02, discard=0.02, locality=1) {
     all.pairs <- rbind(t(combn(nfrags, 2)), cbind(1:nfrags, 1:nfrags))
     all.pairs <- data.frame(anchor.id=all.pairs[,2], target.id=all.pairs[,1])
     npairs <- min(npairs, nrow(all.pairs))
@@ -18,16 +18,19 @@ comp<- function(npairs, nfrags, nlibs, lambda=5, dispersion=0.05, winsorize=0.02
     # Constructing the values.
     actual.mat<-matrix(0, nfrags, nfrags)
     is.filled <- matrix(FALSE, nfrags, nfrags)
-    ave.count <- exp(mglmOneGroup(counts, offset=numeric(nlibs), dispersion=dispersion))
+    ave.count <- exp(mglmOneGroup(counts, offset=numeric(nlibs)))
     for (x in 1:nrow(data)) {
         if (ave.count[x] < 1e-6) { next } # As zeros get removed.
         a<-data@anchors[x]
         t<-data@targets[x]
-        actual.mat[a,t] <- ave.count[x]
-        is.filled[a,t] <- TRUE
         if (a!=t) {
+            actual.mat[a,t] <- ave.count[x]
+            is.filled[a,t] <- TRUE
             actual.mat[t,a] <- ave.count[x]
             is.filled[t,a] <- TRUE
+        } else {
+            actual.mat[a,t] <- 2*ave.count[x]
+            is.filled[a,t] <- TRUE
         }
     }
 
@@ -63,7 +66,7 @@ comp<- function(npairs, nfrags, nlibs, lambda=5, dispersion=0.05, winsorize=0.02
     # Running the reference, and checking that the right number of low fragments are removed.
     # Can't do it directly, because sorting might not be consistent between R and C++.
     iters <- 50
-    test <- correctedContact(data, dispersion=dispersion, winsor=winsorize, ignore=discard,
+    test <- correctedContact(data, winsor=winsorize, ignore=discard,
         iterations=iters, exclude.local=locality)
     to.discard <- is.na(test$bias)
     frag.sum <- rowSums(actual.mat)

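The rearranged loop mirrors a behavioural change in the C++ coverage calculation (see src/iterative_correction.cpp below): every pair now adds its weight to both its anchor and target bins, so a self-pair (a == t) contributes twice. The reference matrix reproduces this by doubling the diagonal, which makes rowSums() agree with the C++ coverage. A toy check of that equivalence, with made-up weights:

# Three bins, three pairs: a self-pair (1,1) with weight 2, pair (1,2) with
# weight 5, pair (2,3) with weight 1, and a self-pair (3,3) with weight 4.
mat <- matrix(0, 3, 3)
mat[1,1] <- 2*2                # self-pair counted twice: covptr[a] += w; covptr[t] += w
mat[1,2] <- mat[2,1] <- 5      # off-diagonal pairs filled symmetrically
mat[2,3] <- mat[3,2] <- 1
mat[3,3] <- 2*4

rowSums(mat)                   # per-bin coverage: 9 6 9, matching the C++ definition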
inst/tests/test-itercor.Rout.save

+15 -12

@@ -1,6 +1,6 @@
 
-R Under development (unstable) (2014-12-14 r67167) -- "Unsuffered Consequences"
-Copyright (C) 2014 The R Foundation for Statistical Computing
+R version 3.2.0 (2015-04-16) -- "Full of Ingredients"
+Copyright (C) 2015 The R Foundation for Statistical Computing
 Platform: x86_64-unknown-linux-gnu (64-bit)
 
 R is free software and comes with ABSOLUTELY NO WARRANTY.
@@ -23,7 +23,7 @@ Type 'q()' to quit R.
 > suppressWarnings(suppressPackageStartupMessages(require(diffHic)))
 > suppressPackageStartupMessages(require(edgeR))
 >
-> comp<- function(npairs, nfrags, nlibs, lambda=5, dispersion=0.05, winsorize=0.02, discard=0.02, locality=1) {
+> comp<- function(npairs, nfrags, nlibs, lambda=5, winsorize=0.02, discard=0.02, locality=1) {
 +     all.pairs <- rbind(t(combn(nfrags, 2)), cbind(1:nfrags, 1:nfrags))
 +     all.pairs <- data.frame(anchor.id=all.pairs[,2], target.id=all.pairs[,1])
 +     npairs <- min(npairs, nrow(all.pairs))
@@ -37,16 +37,19 @@ Type 'q()' to quit R.
 +     # Constructing the values.
 +     actual.mat<-matrix(0, nfrags, nfrags)
 +     is.filled <- matrix(FALSE, nfrags, nfrags)
-+     ave.count <- exp(mglmOneGroup(counts, offset=numeric(nlibs), dispersion=dispersion))
++     ave.count <- exp(mglmOneGroup(counts, offset=numeric(nlibs)))
 +     for (x in 1:nrow(data)) {
 +         if (ave.count[x] < 1e-6) { next } # As zeros get removed.
 +         a<-data@anchors[x]
 +         t<-data@targets[x]
-+         actual.mat[a,t] <- ave.count[x]
-+         is.filled[a,t] <- TRUE
 +         if (a!=t) {
++             actual.mat[a,t] <- ave.count[x]
++             is.filled[a,t] <- TRUE
 +             actual.mat[t,a] <- ave.count[x]
 +             is.filled[t,a] <- TRUE
++         } else {
++             actual.mat[a,t] <- 2*ave.count[x]
++             is.filled[a,t] <- TRUE
 +         }
 +     }
 +
@@ -82,7 +85,7 @@ Type 'q()' to quit R.
 +     # Running the reference, and checking that the right number of low fragments are removed.
 +     # Can't do it directly, because sorting might not be consistent between R and C++.
 +     iters <- 50
-+     test <- correctedContact(data, dispersion=dispersion, winsor=winsorize, ignore=discard,
++     test <- correctedContact(data, winsor=winsorize, ignore=discard,
 +         iterations=iters, exclude.local=locality)
 +     to.discard <- is.na(test$bias)
 +     frag.sum <- rowSums(actual.mat)
@@ -195,12 +198,12 @@ Type 'q()' to quit R.
 >
 > # Trying with no special attention.
 > comp(50, 20, 1, discard=0, winsor=0, locality=-1)
-[1] 14.785235  1.315092  6.763417 11.012796  7.758946  4.203576
+[1] 13.016640  1.346375  7.350337 10.195861  6.932015  4.199896
 > comp(50, 50, 1, discard=0, winsor=0, locality=-1)
-[1] 1.945631e-05 1.966686e+01 2.732566e-04 0.000000e+00 6.076916e-05
-[6] 6.514706e+00
+[1] 1.945077e-05 1.754996e+01 2.764390e-04 0.000000e+00 6.075174e-05
+[6] 7.745099e+00
 > comp(50, 20, 2, discard=0, winsor=0, locality=-1)
-[1] 6.769012 2.291995 2.505494 6.310373 3.699439 6.502240
+[1] 6.680250 3.036901 2.545979 6.006799 3.768523 6.211862
 > comp(50, 20, 2, discard=0, winsor=0, locality=1000)
 [1] 10.195305  2.220565 10.871631 10.509848  5.077755  3.727807
 > comp(100, 20, 2, discard=0, winsor=0, locality=1000)
@@ -211,4 +214,4 @@ Type 'q()' to quit R.
 >
 > proc.time()
    user  system elapsed
- 11.241   0.160  11.406
+  6.499   0.128   6.639

man/correctedContact.Rd

+39 -13

@@ -7,7 +7,7 @@
 
 \usage{
 correctedContact(data, iterations=50, exclude.local=1, ignore.low=0.02,
-    winsor.high=0.02, average=TRUE, dispersion=0.05)
+    winsor.high=0.02, average=TRUE, dist.correct=FALSE)
 }
 
 \arguments{
@@ -17,7 +17,7 @@ correctedContact(data, iterations=50, exclude.local=1, ignore.low=0.02,
 \item{ignore.low}{a numeric scalar, indicating the proportion of low-abundance bins to ignore}
 \item{winsor.high}{a numeric scalar indicating the proportion of high-abundance bin pairs to winsorize}
 \item{average}{a logical scalar specifying whether counts should be averaged across libraries}
-\item{dispersion}{a numeric scalar for use in computing the average count in \code{\link{mglmOneGroup}}}
+\item{dist.correct}{a logical scalar indicating whether to correct for distance effects}
 }
 
 \value{
@@ -26,6 +26,7 @@ A list with several components.
 \item{\code{truth}:}{a numeric vector containing the true interaction probabilities for each bin pair}
 \item{\code{bias}:}{a numeric vector of biases for all bins}
 \item{\code{max}:}{a numeric vector containing the maximum fold-change in biases at each iteration}
+\item{\code{trend}:}{a numeric vector specifying the fitted value for the distance-dependent trend, if \code{dist.correct=TRUE}}
 }
 If \code{average=FALSE}, each component is a numeric matrix instead.
 Each column of the matrix contains the specified information for each library in \code{data}.
@@ -34,31 +35,39 @@ Each column of the matrix contains the specified information for each library in
 \details{
 This function implements the iterative correction procedure described by Imakaev \emph{et al.} in their 2012 paper.
 Briefly, this aims to factorize the count for each bin pair into the bias for the anchor bin, the bias for the target bin and the true interaction probability.
-The probability sums to 1 across all bin pairs for a given bin.
 The bias represents the ease of sequencing/mapping/other for that genomic region.
 
 The \code{data} argument should be generated by taking the output of \code{\link{squareCounts}} after setting \code{filter=1}.
 Filtering should be avoided as counts in low-abundance bin pairs may be informative upon summation for each bin.
 For example, a large count sum for a bin may be formed from many bin pairs with low counts.
-Removal of those bin pairs would result in the loss of per-bin information.
+Removal of those bin pairs would result in loss of information.
 
+For \code{average=TRUE}, if multiple libraries are used to generate \code{data}, an average count will be computed for each bin pair across all libraries using \code{\link{mglmOneGroup}}.
+The average count will then be used for correction.
+Otherwise, correction will be performed on the counts for each library separately.
+
+The maximum step size in the output can be used as a measure of convergence.
+Ideally, the step size should approach 1 as iterations pass.
+This indicates that the correction procedure is converging to a single solution, as the maximum change to the computed biases is decreasing.
+}
+
+\section{Additional parameter settings}{
 Some robustness is provided by winsorizing out strong interactions with \code{winsor.high} to ensure that they do not overly influence the computed biases.
+This is useful for removing spikes around repeat regions or due to PCR duplication.
 Low-abundance bins can also be removed with \code{ignore.low} to avoid instability during correction, though this will result in \code{NA} values in the output.
 
 Local bin pairs can be excluded as these are typically irrelevant to long-range interactions.
 They are also typically very high-abundance and may have excessive weight during correction, if not removed.
 This can be done by removing all bin pairs where the difference between the anchor and target indices is less than \code{exclude.local}.
 
-For \code{average=TRUE}, if multiple libraries are used to generate \code{data}, an average count will be computed for each bin pair across all libraries using \code{\link{mglmOneGroup}} with the specified \code{dispersion}.
-The average count will then be used for correction.
-Otherwise, correction will be performed on the counts for each library separately.
-
-The maximum step size in the output can be used as a measure of convergence.
-Ideally, the step size should approach 1 as iterations pass.
-This indicates that the correction procedure is converging to a single solution, as the maximum change to the computed biases is decreasing.
+If \code{dist.correct=TRUE}, abundances will be adjusted for distance-dependent effects.
+This is done by computing residuals from the fitted distance-abundance trend, using the \code{filterTrended} function.
+These residuals are then used for iterative correction, such that local interactions will not always have higher contact probabilities.
 
-% True signals are continuous variables and have limited use in count-based statistical frameworks.
-% You need to compute the bias for each one to get the offset.
+Ideally, the probability sums to unity across all bin pairs for a given bin (ignoring \code{NA} entries).
+This is complicated by winsorizing of high-abundance interactions and removal of local interactions.
+These interactions are not involved in correction, but are still reported in the output \code{truth}.
+As a result, the sum may not equal unity, i.e., values are not strictly interpretable as probabilities.
 }
 
 \examples{
@@ -84,6 +93,23 @@ stuff <- correctedContact(data, average=FALSE)
 head(stuff$truth)
 head(stuff$bias)
 head(stuff$max)
+
+# Creating an offset matrix, for use in glmFit.
+anchor.bias <- stuff$bias[anchors(data, id=TRUE),]
+target.bias <- stuff$bias[targets(data, id=TRUE),]
+offsets <- log(anchor.bias * target.bias)
+difference <- log(stuff$truth) - (log(counts(data)) - offsets) # effective function of offset in GLMs.
+stopifnot(all(is.na(difference) | difference < 1e-8))
+
+# Adjusting for distance, and computing offsets with trend correction.
+stuff <- correctedContact(data, average=FALSE, dist.correct=TRUE)
+head(stuff$truth)
+head(stuff$trend)
+offsets <- log(stuff$bias[anchors(data, id=TRUE),]) +
+    log(stuff$bias[targets(data, id=TRUE),]) +
+    stuff$trend/log2(exp(1))
+difference <- log(stuff$truth) - (log(counts(data)) - offsets)
+stopifnot(all(is.na(difference) | difference < 1e-8))
 }
 
 \author{Aaron Lun}

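In the second man-page example, the trend returned by correctedContact() is on the log2 scale (it comes from filterTrended()), while the bias terms and edgeR GLM offsets are in natural log. Dividing by log2(exp(1)) converts log2 units to natural log, since log2(e) = 1/ln(2). A one-line check of that identity:

log2.trend <- 3.5
all.equal(log2.trend / log2(exp(1)), log2.trend * log(2)) # TRUE: both give the ln-scale trend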
src/iterative_correction.cpp

+14 -8

@@ -110,10 +110,8 @@ try {
     if (discarded > 0) {
         for (int pr=0; pr<npairs; ++pr) { // Computing the coverage.
             if (ISNA(wptr[pr])) { continue; }
-            const int& cura=aptr[pr];
-            const int& curt=tptr[pr];
-            covptr[cura]+=wptr[pr];
-            if (cura!=curt) { covptr[curt]+=wptr[pr]; }
+            covptr[aptr[pr]]+=wptr[pr];
+            covptr[tptr[pr]]+=wptr[pr];
         }
 
         int* ordering=new int [numfrags];
@@ -166,10 +164,8 @@ try {
     for (int it=0; it<iterations; ++it) {
        for (int pr=0; pr<npairs; ++pr) { // Computing the coverage (ignoring locals, if necessary).
             if (ISNA(wptr[pr])) { continue; }
-            const int& cura=aptr[pr];
-            const int& curt=tptr[pr];
-            covptr[cura]+=wptr[pr];
-            if (cura!=curt) { covptr[curt]+=wptr[pr]; }
+            covptr[aptr[pr]]+=wptr[pr];
+            covptr[tptr[pr]]+=wptr[pr];
         }
 
         /* Computing the 'additional bias' vector, to take the mean across all fragments.
@@ -207,6 +203,16 @@ try {
             }
         }
     }
+
+    /* Recalculating the contact probabilities, using the estimated biases.
+     * This gets normalized values for the Winsorized elements as well.
+     * However, the values are no longer strictly interpretable as probabilities.
+     */
+    if (todrop > 0) {
+        for (int pr=0; pr<npairs; ++pr) {
+            wptr[pr] = acptr[pr]/biaptr[aptr[pr]]/biaptr[tptr[pr]];
+        }
+    }
 } catch (std::exception& e) {
     UNPROTECT(1);
     throw;

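The block appended at the end of the C++ routine re-expresses each pair's value as its average count divided by the product of the two bin biases, which is exactly the factorization that the new man-page examples verify from R. A rough R rendition of that one-liner, with made-up values:

# R sketch of wptr[pr] = acptr[pr]/biaptr[aptr[pr]]/biaptr[tptr[pr]],
# using toy counts and converged biases for illustration.
ave.count <- c(10, 4, 6)       # average count per bin pair
anchor    <- c(1, 1, 2)        # anchor bin index for each pair
target    <- c(2, 3, 3)        # target bin index for each pair
bias      <- c(2.0, 1.0, 0.5)  # converged per-bin biases

truth <- ave.count / (bias[anchor] * bias[target])
truth # 5 4 12: counts with both bin biases divided out
# Winsorized pairs get a normalized value too, even though they were capped
# during bias estimation -- hence the values need not sum to unity per bin.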