-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsubsetLoci.R
114 lines (98 loc) · 5.93 KB
/
subsetLoci.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#SubsetLoci.R
#still some bugs - see line 21##
# Script to align RAD loci against a reference genome (via BLAT),
# subset for loci matching a selected chromosome (or chromosomes)
# and produce a new alignment of only those loci.
# Filters for blat hits are in the block beginning "#get an index of the locus # for z-chromosome loci"
# Need pyRAD outfiles including one .phy-formatted file and the .loci format. Currently set up to work with sbf1 cut site RAD data.
library(ggplot2);library(plyr)
setwd("/Users/fencelizard/Dropbox/vRAD3/")
#Reading in table of the number and size of chromosomes in the zebra finch - for optional plots of loci per chromosome. Ignore this for subsetting alignments.
z<- read.table("/Users/fencelizard/Documents/vRAD2/blat/zfinch_genome/zFinch_chrData.txt",header=TRUE)
z$Size <- as.numeric(as.character(z$Size))
z$ID <- factor(z$ID,as.character(z$ID))
#read in .loci file, find all bw loci breaks ("//"), for each locus read from the previous break +1 to the next break,
#randomly choose one sample's sequence, store that sequence, add back on the full cut site (CCTGCAGG),
#reformat as fasta, name the loci sequentially, and write them to a new file to use with BLAT.
#PROBLEMS: sometimes pyRAD doesn't cut the whole overhang and instead leaves the last "G" on the beginning of the
#sequence. Not clear why - it's not just loci with indels. Result is that adding the full cut site results
#in a mismatch or gap of at least 1, b/c the new seq has an extra "G" at the end of the cut site.
lociLines <- readLines("./pyRAD/outfiles/c2h8.loci")
endMarks <- grep("//",lociLines)
loci<- c()
j <- 1
for (i in 1:length(endMarks)){
locus <- lociLines[j:endMarks[i]]
j <- endMarks[i]+1
n <- as.integer(runif(1,1,length(locus)-1))
locus <- locus[n]
locus <- sub(" {2,}","\nCCTGCAGG",locus)
locus <- sub(">",paste(">locus",i,"_",sep=""),locus)
loci <- append(loci,locus)
}
write(loci,"./refAlign/seq/c2h8loci.fasta")
#run blat on the loci file (REMEMBER TO CHECK FILE PATHS FOR CORRECT FILES!)
system("blat /Users/fencelizard/Documents/vRAD2/blat/zfinch_genome/taeGut2.2bit /Users/fencelizard/Dropbox/vRAD3/refAlign/seq/c19h8loci.fasta /Users/fencelizard/Dropbox/vRAD3/refAlign/blatOutput/c19h8loci_zfAlign.pslx -t=dna -q=dna -out=pslx")
#get an index of the locus # for z-chromosome loci
a <- read.delim("./refAlign/blatOutput/c2h8loci_zfAlign.pslx",skip=5)
colnames(a) <- c("match","mis-match","rep.match","N's","Q.gap.count","Q.gap.bases","Tgap.count","Tgap.bases","strand","Q.name","Q.size","Q.start","Q.end","Tname","T.size","Tstart","T.end", "block.count","blockSizes","qStarts","tStarts","q.seq","t.seq")
a <- subset(a,a$match >=40)
duplicates <- subset(a,duplicated(a$Q.name))
a <- subset(a, !duplicated(a$Q.name))
a <- subset(a,a$Tname=="chrZ")
index <- as.character(a$Q.name)
index <- sapply(strsplit(index,"_"),'[[',1)
index <- sapply(strsplit(index,"s"),'[[',2)
index <- as.numeric(index)
######bar graphs for counts of loci by chromosome:
#a <- subset(a,a$Tname == "chr1"|a$Tname == "chr1A"|a$Tname == "chr1B"|a$Tname == "chr2"|a$Tname == "chr3"|a$Tname == "chr4"|a$Tname == "chr4A"|a$Tname == "chr5"|a$Tname == "chr6"|a$Tname == "chr7"|a$Tname == "chr8"|a$Tname == "chr9"|a$Tname == "chr10"|a$Tname == "chr11"|a$Tname == "chr12"|a$Tname == "chr13"|a$Tname == "chr14"|a$Tname == "chr15"|a$Tname == "chr16"|a$Tname == "chr17"|a$Tname == "chr18"|a$Tname == "chr19"|a$Tname == "chr20"|a$Tname == "chr21"|a$Tname == "chr22"|a$Tname == "chr23"|a$Tname == "chr24"|a$Tname == "chr25"|a$Tname == "chr26"|a$Tname == "chr27"|a$Tname == "chr28"|a$Tname == "chrZ"|a$Tname == "chrM")
#a$Tname <- as.character(a$Tname)
#a$Tname <- substr(a$Tname,4,nchar(a$Tname))
#a$min.samples <- "2"
#a$Tname <- factor(a$Tname,levels=c("1","1A","1B","2","3","4","4A","5",'6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','M','Z'))
#Loci <- c()
#for (i in levels(a$Tname)) {
# sub <- subset(a,a$Tname == i)
# rows <- nrow(sub)
# Loci <- append(Loci,rows)
#}
#df <- data.frame(z,Loci)
#df$loci.per.base <- df$Loci/df$Size
#quartz()
#ggplot()+theme_bw()+scale_fill_grey()+
# theme(axis.text.x=element_text(size=8),panel.grid.major = element_blank())+
# ylab("Aligned Loci")+xlab("Chromosome")+
# geom_bar(data=df,aes(x=ID,y=loci.per.base),stat="identity")
#Create empty data frame for alignment, subset out the z-loci, using the index vector from above to call the indices of the endMarks. Starting is hard: just have to input that.
#grab each locus, trim off the last line, replace spaces with a tab,
lociLines <- readLines("./pyRAD/outfiles/c2h8.loci")
endMarks <- grep("//",lociLines)
samples <- read.table("./pyRAD/outfiles/c2h8.unlinked_snps",skip=1,colClasses="character")[1]
alignment <- data.frame(samples,seq=factor("\t"))
colnames(alignment) <- c('sample','seq')
lociLength <- c()
## this returns a list of vectors alternating sample,seq for each of the z Loci
for (i in index){
locus <- lociLines[endMarks[i-1]:endMarks[i]]
locus <- locus[3:length(locus)-1]
locus <- gsub("* [-a-zA-Z]","\t",locus)
locus <- sub("*\t[-a-zA-Z]","\tCCTGCAGG",locus)
locus <- strsplit(locus,"\t")
locus <- do.call(rbind.data.frame, locus)
colnames(locus) <- c('sample','seq')
locus$seq <- as.character(locus$seq)
locus$sample <- as.character(locus$sample)
locus$sample <- gsub(">","",locus$sample)
locus$sample <- gsub("\t","",locus$sample)
locus$sample <- gsub(" ","",locus$sample)
lociLength <- append(lociLength,max(nchar(locus$seq)))
alignment <- merge(alignment,locus,by="sample",all.x=TRUE)
}
#outputs a df with sample names and sequences, one column per locus, single NA in for missing data for a locus
#loop for each column: 1. read the length of the sequence block and store it, 2. search and replace NA with N repeated as many times as the sequence is long
for (i in 3:ncol(alignment)){
a <- alignment[i]
a[is.na(a)] <- paste(rep("N",lociLength[i-2]),collapse="")
alignment[i] <-a
}
write.table(alignment,"zLoci_c2h8.phy")