-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathexample.r
75 lines (52 loc) · 2.8 KB
/
example.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# load the script (and ignore Rcpp warning message).
source("Apex2R.r")
# ------------------
# single study summary statistics
# ------------------
# Assume we have files from APEX called "chr20.cis_sumstats.txt.gz",
# "chr20.vcov.bin", and "chr20.vcov.idx.gz" in the working durectory.
prefix <- "chr20"
# This creates an object to query the files.
ss <- sumStats(prefix)
# ------------------
# meta-analysis summary statistics
# ------------------
# Now suppose we have these APEX files for each of studies A, B, and C.
system.time(ssm <- metaSumStats(
studyA ="chr20.A",
studyB ="chr20.B",
studyC = "chr20.C"
))
# genes present in all studies
i_genes <- Reduce(intersect, lapply(ssm$study_data, function(x) x$genes))
# ------------------
# example plotting & analysis
# ------------------
gene <- "ENSG00000101004"
## Get XtY and XtX objects for specified gene
gene_sf <- ssm$getSuffStats(gene)
## Get single-variant summary statistics for specified gene
gene_sm <- ssm$getSumStats(gene)
require(ggplot2)
# Find the top SNP for this gene
top_snp <- which.min(gene_sm$pval)
# Construct nice SNP labels, and calculate Rsq with top SNP
gene_sm[,`:=`(
snp = paste(chr,pos,ref,alt,sep='_'),
Rsq = (gene_sf$XtX[top_snp,]^2)/(diag(gene_sf$XtX) * gene_sf$XtX[top_snp,top_snp])
),]
# Make a little manhattan plot
ggplot(gene_sm, aes(x = pos/(1000000), y= log10(1/pval), colour = discr_rsq(Rsq))) + geom_point(size=0.75) + theme_minimal() + coord_cartesian(expand=FALSE,clip='off') + ggsci::scale_colour_locuszoom() + ylab(NULL) + xlab("Chromosome position") + theme(strip.placement = "outside", panel.spacing = unit(2, "lines")) + guides(colour = guide_legend(title = expression(italic(r)^2~"with lead SNP"), override.aes = list(shape = 15, size = 4))) + ggtitle(levels(factor(tmp_l$gene))[ii]) + theme(panel.spacing = unit(0.65, "lines"), axis.text=element_text(size=7, color = "black"), title = element_text(size = 9))
# function to convert correlation matrix to long table
getCorr <- function(x){
require(data.table)
data.table
out <- melt(data.table(as.data.frame(cov2cor(x)), keep.rownames = "snp1"), id.var = "snp1", variable.name = "snp2", value.name = "corr")
out$snp1 <- factor(out$snp1, levels = rownames(x), order = TRUE)
out$snp2 <- factor(out$snp2, levels = rownames(x), order = TRUE)
out
}
# Nearest 200 SNPs surrounding the top SNP
wk <- ( which.min(ss$pval) - 100 ):( which.min(ss$pval) + 100 )
# Plot the LD for SNPs in the top-hit region
p1 <- ggplot(getCorr(test[[1]]$XtX[wk,wk]), aes(x = snp1, y = snp2, fill = corr^2)) + geom_tile() +theme_void() + scale_fill_distiller(palette = "RdYlBu", limits = c(0,1)) + theme(legend.position = "none") + theme(plot.margin = unit(c(1,0.5,1,0.5)/5, "cm")) + ggtitle(ssm$studies[1])