-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplit_fasta_by_sam.py
46 lines (35 loc) · 995 Bytes
/
split_fasta_by_sam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""Split reads in a fasta file according to reference sequences they have been mappend in a SAM file"""
import sys
from Bio import SeqIO
from collections import Counter
MAXsequences = 100
SAMish, FASTA = sys.argv[1:]
partFile = open(SAMish)
groups = {}
for line in partFile:
if line.startswith("@"):
continue
line = line.split("\t")
if line[2] in groups:
exclude.add(line[2])
groups[line[2]] = None
else:
groups[line[2]] = line[0]
for r in exclude:
print "Excluding ambiguous read", r
del (groups[r])
outIO = {}
group_counter = Counter()
for grp in set(groups.values() + ["rest"]):
fname = "%s.fasta" % (grp)
fname = re.sub("[^a-zA-Z0-9.-]", "_", fname)
o = open(fname, "w")
outIO[grp] = o
for inFasta in SeqIO.parse(FASTA, "fasta"):
g = groups.get(inFasta.id, "rest")
o = outIO[g]
SeqIO.write(inFasta, o, "fasta")
group_counter[g] += 1
for o in outIO.values():
o.close()
print group_counter