Skip to content

Commit c70cfd6

Browse files
Aaron LunAaron Lun
Aaron Lun
authored and
Aaron Lun
committed
Added reference argument to filters, updated docs.
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/diffHic@105563 bc3139a8-67e5-0310-9ffc-ced21a209358
1 parent bbd2d67 commit c70cfd6

File tree

6 files changed

+110
-24
lines changed

6 files changed

+110
-24
lines changed

DESCRIPTION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: diffHic
2-
Version: 1.1.8
3-
Date: 2015/06/21
2+
Version: 1.1.9
3+
Date: 2015/06/27
44
Title: Differential analyis of Hi-C data
55
Author: Aaron Lun <alun@wehi.edu.au>
66
Maintainer: Aaron Lun <alun@wehi.edu.au>

R/filters.R

Lines changed: 65 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,54 @@
1-
filterDirect <- function(data, ...)
1+
filterDirect <- function(data, prior.count=2, reference=NULL)
22
# Implements the direct filtering method on the abundances of
3-
# inter-chromosomal bin pairs.
3+
# inter-chromosomal bin pairs. Also allows for specification of
4+
# a reference set of bin pairs (usually larger bins from which
5+
# the abundances can be more stably computed).
46
#
57
# written by Aaron Lun
68
# created 5 March 2015
7-
# last modified 20 March 2015
9+
# last modified 24 June 2015
810
{
11+
if (!is.null(reference)) {
12+
actual.ab <- scaledAverage(asDGEList(data), prior.count=prior.count)
13+
ref <- Recall(reference, prior.count=prior.count)
14+
15+
stopifnot(identical(reference$totals, data$totals))
16+
scaling <- (.getBinSize(reference)/.getBinSize(data))^2
17+
adj.thresh <- .repriorAveLogCPM(ref$threshold, totals=data$totals,
18+
prior.count=prior.count, scaling=scaling)
19+
return(list(abundances=actual.ab, threshold=adj.thresh, ref=ref))
20+
}
21+
922
all.chrs <- seqnames(regions(data))
1023
is.inter <- as.logical(all.chrs[anchors(data, id=TRUE)]!=all.chrs[targets(data, id=TRUE)])
11-
ave.ab <- scaledAverage(asDGEList(data), ...)
24+
ave.ab <- scaledAverage(asDGEList(data), prior.count=prior.count)
1225

13-
threshold <- .getInterThreshold(all.chrs, ave.ab[is.inter], empty=.makeEmpty(data, ...))
26+
threshold <- .getInterThreshold(all.chrs, ave.ab[is.inter],
27+
empty=.makeEmpty(data, prior.count=prior.count))
1428
return(list(abundances=ave.ab, threshold=threshold))
1529
}
1630

17-
.getInterThreshold <- function(all.chrs, inter.ab, empty=NA) {
18-
# Getting the total number of inter-chromosomal bins.
31+
.getBinSize <- function(data)
32+
# Gets the bin size in base pairs. This should be easy for bin pairs,
33+
# but we also allow for more exotic set-ups, e.g., Capture-C loaded
34+
# with regionCounts where anchors are bins around probes (evenly
35+
# sized so treatable as bin pairs, but irregularly spaced).
36+
{
37+
out <- exptData(data)$width
38+
if (is.null(out)) { out <- median(regions(data)) }
39+
return(out)
40+
}
41+
42+
.getInterThreshold <- function(all.chrs, inter.ab, empty=NA)
43+
# Computes the threshold from inter-chromosomal interactions.
44+
# First we get the total number of inter-chromosomal bins,
45+
# and then we compue the median (accounting for those lost).
46+
{
1947
n.bins <- as.numeric(runLength(all.chrs))
2048
total.bins <- sum(n.bins)
2149
n.inter <- total.bins * (total.bins + 1L)/2L - sum(n.bins * (n.bins + 1L)/2L)
2250
prop.kept <- length(inter.ab)/n.inter
2351

24-
# Getting the threshold.
2552
if (prop.kept >= 1) {
2653
threshold <- median(inter.ab)
2754
} else if (prop.kept < 0.5) {
@@ -35,20 +62,43 @@ filterDirect <- function(data, ...)
3562

3663
.makeEmpty <- function(data, ...) { scaledAverage(DGEList(rbind(integer(ncol(data))), lib.size=data$totals), ...) }
3764

38-
filterTrended <- function(data, span=0.25, ...)
65+
.repriorAveLogCPM <- function(AveLogCPM, totals, prior.count, scaling)
66+
# Adjusting the average log-CPM to use a new prior count.
67+
{
68+
ave.count <- 2^AveLogCPM * mean(totals) / 1e6
69+
ave.count <- ave.count + prior.count*(scaling - 1)
70+
return(log2(ave.count * 1e6 / mean(totals) / scaling))
71+
}
72+
73+
filterTrended <- function(data, span=0.25, prior.count=2, reference=NULL)
3974
# Implements the trended filtering method on the abundances of
40-
# inter-chromosomal bin pairs.
75+
# inter-chromosomal bin pairs. Again, with allowances for a reference set.
4176
#
4277
# written by Aaron Lun
4378
# created 5 March 2015
44-
# last modified 20 March 2015
79+
# last modified 24 June 2015
4580
{
81+
if (!is.null(reference)) {
82+
actual.ab <- scaledAverage(asDGEList(data), prior.count=prior.count)
83+
actual.dist <- log10(getDistance(data, type="mid") + .getBinSize(data))
84+
ref <- Recall(reference, span=span, prior.count=prior.count)
85+
86+
new.threshold <- approx(x=ref$log.distance, y=ref$threshold, xout=actual.dist, rule=2)$y
87+
new.threshold[is.na(actual.dist)] <- ref$threshold[is.na(ref$log.distance)][1] # Direct threshold.
88+
89+
stopifnot(identical(reference$totals, data$totals))
90+
scaling <- (.getBinSize(reference)/.getBinSize(data))^2
91+
adj.thresh <- .repriorAveLogCPM(new.threshold, totals=data$totals,
92+
prior.count=prior.count, scaling=scaling)
93+
return(list(abundances=actual.ab, threshold=adj.thresh, log.distance=actual.dist, ref=ref))
94+
}
95+
4696
dist <- getDistance(data, type="mid")
47-
log.dist <- log10(dist + exptData(data)$width)
48-
ave.ab <- scaledAverage(asDGEList(data), ...)
97+
log.dist <- log10(dist + .getBinSize(data))
98+
ave.ab <- scaledAverage(asDGEList(data), prior.count=prior.count)
4999

50100
# Filling in the missing parts of the interaction space.
51-
empty <- .makeEmpty(data, ...)
101+
empty <- .makeEmpty(data, prior.count=prior.count)
52102
is.intra <- !is.na(log.dist)
53103
n.intras <- sum(is.intra)
54104
all.chrs <- seqnames(regions(data))
@@ -66,7 +116,7 @@ filterTrended <- function(data, span=0.25, ...)
66116
extra.dist <- .Call(cxx_get_missing_dist, cumsum(runLength(all.chrs)),
67117
a.pts-1L, t.pts-1L, (start(regions(data))+end(regions(data)))/2)
68118
if (is.character(extra.dist)) { stop(extra.dist) }
69-
extra.dist <- log10(extra.dist + exptData(data)$width)
119+
extra.dist <- log10(extra.dist + .getBinSize(data))
70120
trend.threshold <- loessFit(x=c(log.dist, extra.dist),
71121
y=c(ave.ab, rep(empty, length(extra.dist))),
72122
span=span)$fitted[1:length(log.dist)]

inst/NEWS.Rd

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
\title{diffHic News}
33
\encoding{UTF-8}
44

5-
\section{Version 1.1.7}{\itemize{
5+
\section{Version 1.1.9}{\itemize{
66
\item
77
Added library size specification to DIList methods normalize(), asDGEList().
88

@@ -42,6 +42,9 @@ Switched to reporting ranges directly from boxPairs(), added support for minimum
4242
\item
4343
Modified consolidatePairs() to accept index vectors for greater modularity.
4444

45+
\item
46+
Added reference argument for large bin pairs, in filterDirect() and filterTrended().
47+
4548
\item
4649
Updated documentation, tests and user's guide.
4750
}}

man/DNaseHiC.Rd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ Also, invalidity of chimeras is determined by checking whether the 3' end is mor
3737
The size of the pseudo-fragments is determined by, well, \code{size} in \code{segmentGenome}.
3838
Smaller sizes provide better resolution but increase computational work.
3939
Needless to say, the \code{param$fragments} field should contain the output from \code{segmentGenome}, rather than from \code{\link{cutGenome}}.
40+
Also see \code{\link{cutGenome}} documentation for a warning about the chromosome names.
4041

4142
Some loss of spatial resolution is inevitable when reads are summarized into pseudo-fragments.
4243
This is largely irrelevant, though, as counting across the interaction space will ultimately use much larger bins (usually at least 2 kbp).

man/cutGenome.Rd

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,13 @@ Nonetheless, they are still reported to maintain the correspondence between frag
3333
Cleavage sites on the forward strand can be obtained as the \code{start} locations of all fragments (excepting the first fragment on each chromosome).
3434
}
3535

36+
\section{Warning}{
37+
If \code{bs} is a FASTQ file, the chromosome names in the FASTQ headers will be loaded faithfully by \code{cutGenome}.
38+
However, many mapping pipelines will drop the rest of the name past the first whitespace when constructing the alignment index.
39+
To be safe, users should ensure that the chromosome names in the FASTQ headers consist of one word.
40+
Otherwise, there will be a discrepancy between the chromosome names in the output \code{GRanges}, and those in the BAM files after alignment.
41+
}
42+
3643
% Interpretations of consecutive sites is generally tricky.
3744
% For starters, the 'remainder' is so low that the strands are unlikely to stay stuck together until the fill-in step.
3845
% This becomes an impossibility if remainder is zero, such that ssDNA is formed after cleavage of consecutive sites.

man/filters.Rd

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,15 @@
66
\description{Implementations of the direct and trended filtering strategies for bin pair abundances.}
77

88
\usage{
9-
filterDirect(data, ...)
10-
filterTrended(data, span=0.25, ...)
9+
filterDirect(data, prior.count=2, reference=NULL)
10+
filterTrended(data, span=0.25, prior.count=2, reference=NULL)
1111
}
1212

1313
\arguments{
1414
\item{data}{a \code{DIList} object produced by \code{\link{squareCounts}}}
1515
\item{span}{a numeric scalar specifying the bandwidth for loess curve fitting}
16-
\item{...}{other arguments to be passed to \code{\link{scaledAverage}}}
16+
\item{prior.count}{a numeric scalar indicating the prior count to use for calculating the average abundance}
17+
\item{reference}{another \code{DIList} object, usually containing data for larger bin pairs}
1718
}
1819

1920
\details{
@@ -32,14 +33,20 @@ Lower values may need to be used for a more accurate fit when the trend is highl
3233
The bin size is also added to the distance prior to log-transformation, to avoid problems with undefined values when distances are equal to zero.
3334
Empty parts of the interaction space are considered by inferring the abundances and distances of the corresponding bin pairs (though this is skipped if too much of the space is empty).
3435

35-
The \code{scale} argument can be passed to \code{\link{scaledAverage}}, in order to handle comparisons between bin pairs of different sizes.
36-
Check out the user's guide for more details.
36+
If \code{reference} is specified, it will be used to compute filter thresholds instead of \code{data}.
37+
This is intended for large bin pairs that have been loaded with \code{filter=1}.
38+
Larger bins provide larger counts for more precise threshold estimates, while the lack of filtering ensures that estimates are not biased.
39+
All threshold estimates are adjusted to account for differences in bin sizes between \code{reference} and \code{data}.
40+
The final values can be used to directly filter on abundances in \code{data}; check out the user's guide for more details.
3741
}
3842
3943
\value{
4044
A list is returned containing \code{abundances}, a numeric vector with the average abundances of all bin pairs in \code{data}.
4145
For \code{filterDirect}, the list contains a numeric scalar \code{threshold}, i.e., the non-specific ligation rate.
42-
For \code{filterTrended}, the list contains \code{threshold}, a numeric vector containing the threshold for each bin pair; and \code{log.distances}, a numeric vector with the log-distances for each bin pair.
46+
For \code{filterTrended}, the list contains \code{threshold}, a numeric vector containing the threshold for each bin pair; and \code{log.distance}, a numeric vector with the log-distances for each bin pair.
47+
48+
If \code{reference} is specified in either function, an additional list named \code{ref} is also returned.
49+
This contains the filtering information for the bin pairs in \code{reference}, same as that reported above for each bin pair in \code{data}.
4350
}
4451
4552
\seealso{
@@ -73,6 +80,24 @@ y[keep,]
7380
trended <- filterTrended(y)
7481
keep <- trended$abundances > trended$threshold
7582
y[keep,]
83+
84+
# Running reference comparisons, using larger bin pairs.
85+
w <- 5L
86+
a2 <- a/w
87+
b2 <- b/w
88+
regions2 <- GRanges(rep(c("chrA", "chrB"), c(a2, b2)),
89+
IRanges(c(1:a2, 1:b2)*w-w+1L, c(1:a2, 1:b2)*w))
90+
npairs2 <- 20
91+
all.anchors2 <- sample(length(regions2), npairs2, replace=TRUE)
92+
all.targets2 <- as.integer(runif(npairs2, 1, all.anchors2+1))
93+
y2 <- DIList(matrix(rnbinom(npairs2*4, mu=10*w^2, size=10), npairs2, 4),
94+
anchors=all.anchors2, targets=all.targets2, regions=regions2,
95+
totals=y$totals, exptData=List(width=w))
96+
97+
direct2 <- filterDirect(y, reference=y2)
98+
sum(direct2$abundances > direct2$threshold + log2(1.5))
99+
trended2 <- filterTrended(y, reference=y2)
100+
sum(trended2$abundances > trended2$threshold)
76101
}
77102
78103
\references{

0 commit comments

Comments
 (0)