Commit d007ca8

Aaron Lun authored and committed
Updated correctedContact, fixed filters.
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/diffHic@106535 bc3139a8-67e5-0310-9ffc-ced21a209358
1 parent 448c97d commit d007ca8

9 files changed, +120 -60 lines changed

DESCRIPTION

+2 -2

@@ -1,6 +1,6 @@
 Package: diffHic
-Version: 1.1.10
-Date: 2015/07/13
+Version: 1.1.11
+Date: 2015/07/18
 Title: Differential analyis of Hi-C data
 Author: Aaron Lun <alun@wehi.edu.au>
 Maintainer: Aaron Lun <alun@wehi.edu.au>

R/correctedContact.R

+29 -13

@@ -1,4 +1,5 @@
-correctedContact <- function(data, iterations=50, exclude.local=1, ignore.low=0.02, winsor.high=0.02, average=TRUE, dispersion=0.05)
+correctedContact <- function(data, iterations=50, exclude.local=1, ignore.low=0.02, winsor.high=0.02,
+        average=TRUE, dist.correct=FALSE)
 # This performs the iterative correction method of Mirny et al. (2012) to
 # identify the true contact probability of each patch of the interaction
 # space. The idea is to use the true contact probability as a filter
@@ -9,20 +10,24 @@ correctedContact <- function(data, iterations=50, exclude.local=1, ignore.low=0.
 #
 # written by Aaron Lun
 # some time ago
-# last modified 20 March 2015
+# last modified 18 July 2015
 {
     if (!average & ncol(data)>1L) {
-        collected.truth <- collected.bias <- collected.max <- list()
+        collected.truth <- collected.bias <- collected.max <- collected.trend <- list()
         for (lib in 1:ncol(data)) {
             out <- Recall(data[,lib], iterations=iterations, exclude.local=exclude.local, ignore.low=ignore.low,
-                winsor.high=winsor.high, average=FALSE, dispersion=dispersion)
+                winsor.high=winsor.high, average=FALSE, dist.correct=dist.correct)
             collected.truth[[lib]] <- out$truth
             collected.bias[[lib]] <- out$bias
             collected.max[[lib]] <- out$max
+            collected.trend[[lib]] <- out$trend
         }
-        return(list(truth=do.call(cbind, collected.truth),
+
+        output <- list(truth=do.call(cbind, collected.truth),
             bias=do.call(cbind, collected.bias),
-            max=do.call(cbind, collected.max)))
+            max=do.call(cbind, collected.max))
+        if (dist.correct) { output$trend <- do.call(cbind, collected.trend) }
+        return(output)
     }
 
     # Checking arguments.
@@ -34,24 +39,35 @@ correctedContact <- function(data, iterations=50, exclude.local=1, ignore.low=0.
     if (winsor.high >= 1) { stop("proportion of high coverage interactions to winsorize should be less than 1") }
     exclude.local <- as.integer(exclude.local)
 
-    is.local <- !is.na(getDistance(data))
-    log.lib <- log(data$totals)
-    if (length(log.lib)>1L) {
-        ave.counts <- exp(edgeR::mglmOneGroup(counts(data), offset=log.lib - mean(log.lib), dispersion=dispersion))
+    # Computing average counts, with or without distance correction.
+    if (dist.correct) {
+        temp <- data
+        temp$totals <- 1e6 # need library size to be reflected in fitted value of trend.
+        trended <- filterTrended(temp, prior.count=0)
+        ave.counts <- 2^(trended$abundance - trended$threshold)
+        is.local <- !is.na(trended$log.distance)
         nzero <- !is.na(ave.counts)
     } else {
-        nzero <- counts(data) != 0L
-        ave.counts <- as.double(counts(data))
+        is.local <- !is.na(getDistance(data))
+        log.lib <- log(data$totals)
+        if (length(log.lib)>1L) {
+            ave.counts <- exp(edgeR::mglmOneGroup(counts(data), offset=log.lib - mean(log.lib)))
+            nzero <- !is.na(ave.counts)
+        } else {
+            nzero <- counts(data) != 0L
+            ave.counts <- as.double(counts(data))
+        }
     }
 
     out<-.Call(cxx_iterative_correction, ave.counts[nzero], anchors(data, id=TRUE)[nzero], targets(data, id=TRUE)[nzero],
         is.local[nzero], length(regions(data)), iterations, exclude.local, ignore.low, winsor.high)
     if (is.character(out)) { stop(out) }
-    full.truth <- rep(NA, length(nzero))
+    full.truth <- rep(0, length(nzero))
     full.truth[nzero] <- out[[1]]
     out[[1]] <- full.truth
 
     names(out) <- c("truth", "bias", "max")
+    if (dist.correct) { out$trend <- trended$threshold }
    return(out)
 }

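Note on the new dist.correct branch above: rather than feeding raw average counts into the correction, it takes log2 abundances from filterTrended() and back-transforms the residual from the fitted distance trend onto the count scale, so that short-range bin pairs are not uniformly favoured. A minimal sketch of that conversion, using made-up log2 vectors in place of the filterTrended() output:

# 'abundance' and 'trend' stand in for trended$abundance and trended$threshold,
# both on the log2 scale as returned by filterTrended().
abundance <- c(5.2, 3.1, 4.8, 2.0)  # log2 average abundance per bin pair
trend     <- c(4.0, 4.0, 2.5, 2.5)  # log2 fitted trend at each pair's distance

ave.counts <- 2^(abundance - trend) # residual, back-transformed to the count scale
ave.counts
# Pairs above the trend give values > 1, pairs below it give values < 1;
# these play the role of 'ave.counts' in the iterative correction above.

Setting temp$totals to a constant (1e6) pins the library size so that, per the in-line comment, it is reflected consistently in the fitted value of the trend.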
R/filters.R

+6 -3

@@ -35,7 +35,7 @@ filterDirect <- function(data, prior.count=2, reference=NULL)
 # sized so treatable as bin pairs, but irregularly spaced).
 {
     out <- exptData(data)$width
-    if (is.null(out)) { out <- median(regions(data)) }
+    if (is.null(out)) { out <- median(width(regions(data))) }
     return(out)
 }
 
@@ -123,8 +123,11 @@ filterTrended <- function(data, span=0.25, prior.count=2, reference=NULL)
     }
 
     # Using the direct threshold.
-    direct.threshold <- .getInterThreshold(seqnames(regions(data)), ave.ab[is.na(log.dist)], empty=empty)
-    trend.threshold[is.na(log.dist)] <- direct.threshold
+    is.inter <- is.na(log.dist)
+    if (any(is.inter)) {
+        direct.threshold <- .getInterThreshold(seqnames(regions(data)), ave.ab[is.inter], empty=empty)
+        trend.threshold[is.inter] <- direct.threshold
+    }
     return(list(abundances=ave.ab, threshold=trend.threshold, log.distance=log.dist))
 }

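Two small fixes here: the old code applied median() to the GRanges returned by regions(data), when the intended fallback was the median width of those regions; and .getInterThreshold() is now only invoked when inter-chromosomal bin pairs actually exist, avoiding an empty subset when all pairs are intra-chromosomal. A quick illustration of the first fix, with a toy GRanges standing in for regions(data):

# Toy GRanges with three regions of width 100, 200 and 400.
suppressPackageStartupMessages(library(GenomicRanges))
gr <- GRanges("chr1", IRanges(start=c(1, 101, 301), end=c(100, 300, 700)))

median(width(gr))  # 200, the intended fallback bin width for filterDirect()
# median() on the GRanges object itself is not meaningful, hence the correction.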
inst/NEWS.Rd

+4 -2

@@ -2,7 +2,7 @@
 \title{diffHic News}
 \encoding{UTF-8}
 
-\section{Version 1.1.10}{\itemize{
+\section{Version 1.1.11}{\itemize{
 \item
 Added library size specification to DIList methods normalize(), asDGEList().
 
@@ -33,6 +33,9 @@ Added compartmentalize() function to identify genomic compartments.
 \item
 Added domainDirections() function to help identify domains.
 
+\item
+Modified correctedContact() to allow distance correction and report factorized probabilities directly.
+
 \item
 Modified marginCounts() function for proper single-end-like treatment of Hi-C data.
 
@@ -48,7 +51,6 @@ Modified consolidatePairs() to accept index vectors for greater modularity.
 \item
 Added reference argument for large bin pairs, in filterDirect() and filterTrended().
 
-
 \item
 Updated documentation, tests and user's guide.
 }}

inst/tests/test-itercor.R

+8 -5

@@ -4,7 +4,7 @@
 suppressWarnings(suppressPackageStartupMessages(require(diffHic)))
 suppressPackageStartupMessages(require(edgeR))
 
-comp<- function(npairs, nfrags, nlibs, lambda=5, dispersion=0.05, winsorize=0.02, discard=0.02, locality=1) {
+comp<- function(npairs, nfrags, nlibs, lambda=5, winsorize=0.02, discard=0.02, locality=1) {
     all.pairs <- rbind(t(combn(nfrags, 2)), cbind(1:nfrags, 1:nfrags))
     all.pairs <- data.frame(anchor.id=all.pairs[,2], target.id=all.pairs[,1])
     npairs <- min(npairs, nrow(all.pairs))
@@ -18,16 +18,19 @@ comp<- function(npairs, nfrags, nlibs, lambda=5, dispersion=0.05, winsorize=0.02
     # Constructing the values.
     actual.mat<-matrix(0, nfrags, nfrags)
     is.filled <- matrix(FALSE, nfrags, nfrags)
-    ave.count <- exp(mglmOneGroup(counts, offset=numeric(nlibs), dispersion=dispersion))
+    ave.count <- exp(mglmOneGroup(counts, offset=numeric(nlibs)))
     for (x in 1:nrow(data)) {
         if (ave.count[x] < 1e-6) { next } # As zeros get removed.
         a<-data@anchors[x]
         t<-data@targets[x]
-        actual.mat[a,t] <- ave.count[x]
-        is.filled[a,t] <- TRUE
         if (a!=t) {
+            actual.mat[a,t] <- ave.count[x]
+            is.filled[a,t] <- TRUE
             actual.mat[t,a] <- ave.count[x]
             is.filled[t,a] <- TRUE
+        } else {
+            actual.mat[a,t] <- 2*ave.count[x]
+            is.filled[a,t] <- TRUE
         }
     }
 
@@ -63,7 +66,7 @@ comp<- function(npairs, nfrags, nlibs, lambda=5, dispersion=0.05, winsorize=0.02
     # Running the reference, and checking that the right number of low fragments are removed.
     # Can't do it directly, because sorting might not be consistent between R and C++.
     iters <- 50
-    test <- correctedContact(data, dispersion=dispersion, winsor=winsorize, ignore=discard,
+    test <- correctedContact(data, winsor=winsorize, ignore=discard,
         iterations=iters, exclude.local=locality)
     to.discard <- is.na(test$bias)
     frag.sum <- rowSums(actual.mat)

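The rearranged loop mirrors a behavioural change in the C++ coverage calculation (see src/iterative_correction.cpp below): every pair now adds its weight to both its anchor and target bins, so a self-pair (a == t) contributes twice. The reference matrix reproduces this by doubling the diagonal, which makes rowSums() agree with the C++ coverage. A toy check of that equivalence, with made-up weights:

# Three bins, three pairs: a self-pair (1,1) with weight 2, pair (1,2) with
# weight 5, pair (2,3) with weight 1, and a self-pair (3,3) with weight 4.
mat <- matrix(0, 3, 3)
mat[1,1] <- 2*2                # self-pair counted twice: covptr[a] += w; covptr[t] += w
mat[1,2] <- mat[2,1] <- 5      # off-diagonal pairs filled symmetrically
mat[2,3] <- mat[3,2] <- 1
mat[3,3] <- 2*4

rowSums(mat)                   # per-bin coverage: 9 6 9, matching the C++ definition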
inst/tests/test-itercor.Rout.save

+15 -12

@@ -1,6 +1,6 @@
 
-R Under development (unstable) (2014-12-14 r67167) -- "Unsuffered Consequences"
-Copyright (C) 2014 The R Foundation for Statistical Computing
+R version 3.2.0 (2015-04-16) -- "Full of Ingredients"
+Copyright (C) 2015 The R Foundation for Statistical Computing
 Platform: x86_64-unknown-linux-gnu (64-bit)
 
 R is free software and comes with ABSOLUTELY NO WARRANTY.
@@ -23,7 +23,7 @@ Type 'q()' to quit R.
 > suppressWarnings(suppressPackageStartupMessages(require(diffHic)))
 > suppressPackageStartupMessages(require(edgeR))
 >
-> comp<- function(npairs, nfrags, nlibs, lambda=5, dispersion=0.05, winsorize=0.02, discard=0.02, locality=1) {
+> comp<- function(npairs, nfrags, nlibs, lambda=5, winsorize=0.02, discard=0.02, locality=1) {
 +     all.pairs <- rbind(t(combn(nfrags, 2)), cbind(1:nfrags, 1:nfrags))
 +     all.pairs <- data.frame(anchor.id=all.pairs[,2], target.id=all.pairs[,1])
 +     npairs <- min(npairs, nrow(all.pairs))
@@ -37,16 +37,19 @@ Type 'q()' to quit R.
 +     # Constructing the values.
 +     actual.mat<-matrix(0, nfrags, nfrags)
 +     is.filled <- matrix(FALSE, nfrags, nfrags)
-+     ave.count <- exp(mglmOneGroup(counts, offset=numeric(nlibs), dispersion=dispersion))
++     ave.count <- exp(mglmOneGroup(counts, offset=numeric(nlibs)))
 +     for (x in 1:nrow(data)) {
 +         if (ave.count[x] < 1e-6) { next } # As zeros get removed.
 +         a<-data@anchors[x]
 +         t<-data@targets[x]
-+         actual.mat[a,t] <- ave.count[x]
-+         is.filled[a,t] <- TRUE
 +         if (a!=t) {
++             actual.mat[a,t] <- ave.count[x]
++             is.filled[a,t] <- TRUE
 +             actual.mat[t,a] <- ave.count[x]
 +             is.filled[t,a] <- TRUE
++         } else {
++             actual.mat[a,t] <- 2*ave.count[x]
++             is.filled[a,t] <- TRUE
 +         }
 +     }
 +
@@ -82,7 +85,7 @@ Type 'q()' to quit R.
 +     # Running the reference, and checking that the right number of low fragments are removed.
 +     # Can't do it directly, because sorting might not be consistent between R and C++.
 +     iters <- 50
-+     test <- correctedContact(data, dispersion=dispersion, winsor=winsorize, ignore=discard,
++     test <- correctedContact(data, winsor=winsorize, ignore=discard,
 +         iterations=iters, exclude.local=locality)
 +     to.discard <- is.na(test$bias)
 +     frag.sum <- rowSums(actual.mat)
@@ -195,12 +198,12 @@ Type 'q()' to quit R.
 >
 > # Trying with no special attention.
 > comp(50, 20, 1, discard=0, winsor=0, locality=-1)
-[1] 14.785235  1.315092  6.763417 11.012796  7.758946  4.203576
+[1] 13.016640  1.346375  7.350337 10.195861  6.932015  4.199896
 > comp(50, 50, 1, discard=0, winsor=0, locality=-1)
-[1] 1.945631e-05 1.966686e+01 2.732566e-04 0.000000e+00 6.076916e-05
-[6] 6.514706e+00
+[1] 1.945077e-05 1.754996e+01 2.764390e-04 0.000000e+00 6.075174e-05
+[6] 7.745099e+00
 > comp(50, 20, 2, discard=0, winsor=0, locality=-1)
-[1] 6.769012 2.291995 2.505494 6.310373 3.699439 6.502240
+[1] 6.680250 3.036901 2.545979 6.006799 3.768523 6.211862
 > comp(50, 20, 2, discard=0, winsor=0, locality=1000)
 [1] 10.195305  2.220565 10.871631 10.509848  5.077755  3.727807
 > comp(100, 20, 2, discard=0, winsor=0, locality=1000)
@@ -211,4 +214,4 @@ Type 'q()' to quit R.
 >
 > proc.time()
    user  system elapsed
- 11.241   0.160  11.406
+  6.499   0.128   6.639

man/correctedContact.Rd

+39 -13

@@ -7,7 +7,7 @@
 
 \usage{
 correctedContact(data, iterations=50, exclude.local=1, ignore.low=0.02,
-    winsor.high=0.02, average=TRUE, dispersion=0.05)
+    winsor.high=0.02, average=TRUE, dist.correct=FALSE)
 }
 
 \arguments{
@@ -17,7 +17,7 @@ correctedContact(data, iterations=50, exclude.local=1, ignore.low=0.02,
 \item{ignore.low}{a numeric scalar, indicating the proportion of low-abundance bins to ignore}
 \item{winsor.high}{a numeric scalar indicating the proportion of high-abundance bin pairs to winsorize}
 \item{average}{a logical scalar specifying whether counts should be averaged across libraries}
-\item{dispersion}{a numeric scalar for use in computing the average count in \code{\link{mglmOneGroup}}}
+\item{dist.correct}{a logical scalar indicating whether to correct for distance effects}
 }
 
 \value{
@@ -26,6 +26,7 @@ A list with several components.
 \item{\code{truth}:}{a numeric vector containing the true interaction probabilities for each bin pair}
 \item{\code{bias}:}{a numeric vector of biases for all bins}
 \item{\code{max}:}{a numeric vector containing the maximum fold-change in biases at each iteration}
+\item{\code{trend}:}{a numeric vector specifying the fitted value for the distance-dependent trend, if \code{dist.correct=TRUE}}
 }
 If \code{average=FALSE}, each component is a numeric matrix instead.
 Each column of the matrix contains the specified information for each library in \code{data}.
@@ -34,31 +35,39 @@ Each column of the matrix contains the specified information for each library in
 \details{
 This function implements the iterative correction procedure described by Imakaev \emph{et al.} in their 2012 paper.
 Briefly, this aims to factorize the count for each bin pair into the bias for the anchor bin, the bias for the target bin and the true interaction probability.
-The probability sums to 1 across all bin pairs for a given bin.
 The bias represents the ease of sequencing/mapping/other for that genomic region.
 
 The \code{data} argument should be generated by taking the output of \code{\link{squareCounts}} after setting \code{filter=1}.
 Filtering should be avoided as counts in low-abundance bin pairs may be informative upon summation for each bin.
 For example, a large count sum for a bin may be formed from many bin pairs with low counts.
-Removal of those bin pairs would result in the loss of per-bin information.
+Removal of those bin pairs would result in loss of information.
 
+For \code{average=TRUE}, if multiple libraries are used to generate \code{data}, an average count will be computed for each bin pair across all libraries using \code{\link{mglmOneGroup}}.
+The average count will then be used for correction.
+Otherwise, correction will be performed on the counts for each library separately.
+
+The maximum step size in the output can be used as a measure of convergence.
+Ideally, the step size should approach 1 as iterations pass.
+This indicates that the correction procedure is converging to a single solution, as the maximum change to the computed biases is decreasing.
+}
+
+\section{Additional parameter settings}{
 Some robustness is provided by winsorizing out strong interactions with \code{winsor.high} to ensure that they do not overly influence the computed biases.
+This is useful for removing spikes around repeat regions or due to PCR duplication.
 Low-abundance bins can also be removed with \code{ignore.low} to avoid instability during correction, though this will result in \code{NA} values in the output.
 
 Local bin pairs can be excluded as these are typically irrelevant to long-range interactions.
 They are also typically very high-abundance and may have excessive weight during correction, if not removed.
 This can be done by removing all bin pairs where the difference between the anchor and target indices is less than \code{exclude.local}.
 
-For \code{average=TRUE}, if multiple libraries are used to generate \code{data}, an average count will be computed for each bin pair across all libraries using \code{\link{mglmOneGroup}} with the specified \code{dispersion}.
-The average count will then be used for correction.
-Otherwise, correction will be performed on the counts for each library separately.
-
-The maximum step size in the output can be used as a measure of convergence.
-Ideally, the step size should approach 1 as iterations pass.
-This indicates that the correction procedure is converging to a single solution, as the maximum change to the computed biases is decreasing.
+If \code{dist.correct=TRUE}, abundances will be adjusted for distance-dependent effects.
+This is done by computing residuals from the fitted distance-abundance trend, using the \code{filterTrended} function.
+These residuals are then used for iterative correction, such that local interactions will not always have higher contact probabilities.
 
-% True signals are continuous variables and have limited use in count-based statistical frameworks.
-% You need to compute the bias for each one to get the offset.
+Ideally, the probability sums to unity across all bin pairs for a given bin (ignoring \code{NA} entries).
+This is complicated by winsorizing of high-abundance interactions and removal of local interactions.
+These interactions are not involved in correction, but are still reported in the output \code{truth}.
+As a result, the sum may not equal unity, i.e., values are not strictly interpretable as probabilities.
 }
 
 \examples{
@@ -84,6 +93,23 @@ stuff <- correctedContact(data, average=FALSE)
 head(stuff$truth)
 head(stuff$bias)
 head(stuff$max)
+
+# Creating an offset matrix, for use in glmFit.
+anchor.bias <- stuff$bias[anchors(data, id=TRUE),]
+target.bias <- stuff$bias[targets(data, id=TRUE),]
+offsets <- log(anchor.bias * target.bias)
+difference <- log(stuff$truth) - (log(counts(data)) - offsets) # effective function of offset in GLMs.
+stopifnot(all(is.na(difference) | difference < 1e-8))
+
+# Adjusting for distance, and computing offsets with trend correction.
+stuff <- correctedContact(data, average=FALSE, dist.correct=TRUE)
+head(stuff$truth)
+head(stuff$trend)
+offsets <- log(stuff$bias[anchors(data, id=TRUE),]) +
+    log(stuff$bias[targets(data, id=TRUE),]) +
+    stuff$trend/log2(exp(1))
+difference <- log(stuff$truth) - (log(counts(data)) - offsets)
+stopifnot(all(is.na(difference) | difference < 1e-8))
 }
 
 \author{Aaron Lun}

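In the second man-page example, the trend returned by correctedContact() is on the log2 scale (it comes from filterTrended()), while the bias terms and edgeR GLM offsets are in natural log. Dividing by log2(exp(1)) converts log2 units to natural log, since log2(e) = 1/ln(2). A one-line check of that identity:

log2.trend <- 3.5
all.equal(log2.trend / log2(exp(1)), log2.trend * log(2)) # TRUE: both give the ln-scale trend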
src/iterative_correction.cpp

+14 -8

@@ -110,10 +110,8 @@ try {
     if (discarded > 0) {
         for (int pr=0; pr<npairs; ++pr) { // Computing the coverage.
             if (ISNA(wptr[pr])) { continue; }
-            const int& cura=aptr[pr];
-            const int& curt=tptr[pr];
-            covptr[cura]+=wptr[pr];
-            if (cura!=curt) { covptr[curt]+=wptr[pr]; }
+            covptr[aptr[pr]]+=wptr[pr];
+            covptr[tptr[pr]]+=wptr[pr];
         }
 
         int* ordering=new int [numfrags];
@@ -166,10 +164,8 @@ try {
     for (int it=0; it<iterations; ++it) {
        for (int pr=0; pr<npairs; ++pr) { // Computing the coverage (ignoring locals, if necessary).
             if (ISNA(wptr[pr])) { continue; }
-            const int& cura=aptr[pr];
-            const int& curt=tptr[pr];
-            covptr[cura]+=wptr[pr];
-            if (cura!=curt) { covptr[curt]+=wptr[pr]; }
+            covptr[aptr[pr]]+=wptr[pr];
+            covptr[tptr[pr]]+=wptr[pr];
         }
 
         /* Computing the 'additional bias' vector, to take the mean across all fragments.
@@ -207,6 +203,16 @@ try {
             }
         }
     }
+
+    /* Recalculating the contact probabilities, using the estimated biases.
+     * This gets normalized values for the Winsorized elements as well.
+     * However, the values are no longer strictly interpretable as probabilities.
+     */
+    if (todrop > 0) {
+        for (int pr=0; pr<npairs; ++pr) {
+            wptr[pr] = acptr[pr]/biaptr[aptr[pr]]/biaptr[tptr[pr]];
+        }
+    }
 } catch (std::exception& e) {
     UNPROTECT(1);
     throw;

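The block appended at the end of the C++ routine re-expresses each pair's value as its average count divided by the product of the two bin biases, which is exactly the factorization that the new man-page examples verify from R. A rough R rendition of that one-liner, with made-up values:

# R sketch of wptr[pr] = acptr[pr]/biaptr[aptr[pr]]/biaptr[tptr[pr]],
# using toy counts and converged biases for illustration.
ave.count <- c(10, 4, 6)       # average count per bin pair
anchor    <- c(1, 1, 2)        # anchor bin index for each pair
target    <- c(2, 3, 3)        # target bin index for each pair
bias      <- c(2.0, 1.0, 0.5)  # converged per-bin biases

truth <- ave.count / (bias[anchor] * bias[target])
truth # 5 4 12: counts with both bin biases divided out
# Winsorized pairs get a normalized value too, even though they were capped
# during bias estimation -- hence the values need not sum to unity per bin.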