-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathchromopainter2eigenstrat.py
93 lines (75 loc) · 2.98 KB
/
chromopainter2eigenstrat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# Convert a chromopainter (?) file to an eigenstrat file.
# The haplotype file is a file of alleles, one haplotype per columm,
# one site per row. The columns are ordered by donor first and then
# by recipient.
# usage: python chromopainter2eigenstrat.py -v vcf_file.vcf(.gz) -o out_root
# will generate out_root.[snp,ind,geno].
from __future__ import division
import sys, getopt, gdc, pdb
################################################################################
def parse_options():
"""
Options are described by the help() function
"""
options ={ "hap":None, "donor":None, "recipient":None, "out":"out" }
try:
opts, args = getopt.getopt(sys.argv[1:], "h:d:r:o:", ["hap", "out", "don", "rec"])
print opts, args
except Exception as err:
print str(err)
sys.exit()
for o, a in opts:
if o in ["-h","--hap"]: options["hap"] = a
if o in ["-d","--don"]: options["donor"] = a
if o in ["-r","--rec"]: options["recipient"] = a
elif o in ["-o","--out"]: options["out"] = a
print "found options:"
print options
return options
################################################################################
def write_ind_file(rec, don, ind):
"""
Make the .ind file out of the recipient and donor files.
"""
total_individuals=0
for what in [don, rec]:
for line in what:
if line=="\n":
next
pop,n=line[:-1].split()
n=int(n)
if n%2:
raise Exception("%s has an odd number of haplotypes"%(pop))
for i in range(n//2):
ind.write("%s%d\tU\t%s\n"%(pop, i, pop))
total_individuals+=1
return total_individuals
################################################################################
def main(options):
hap=gdc.open2(options["hap"])
don=gdc.open2(options["donor"])
rec=gdc.open2(options["recipient"])
snp, ind, geno = [open(options["out"]+x, "w") for x in [".snp", ".ind", ".geno"]]
total_individuals=write_ind_file(rec, don, ind)
removed={"multiallelic":0}
for line in hap:
bits=line[:-1].split()
chr,pos=bits[0:2]
gts=bits[2:]
alleles=list(set(gts))
if len(alleles)>2:
removed["multiallelic"]+=1
next
# Setting first allele to 0 - should check this is ok.
if len(gts)!=2*total_individuals:
raise Exception("N.genotypes!=N.individuals T %s %s"%(chr, pos))
snp.write("\t".join([chr+":"+pos, chr, "0.0", pos, alleles[0], alleles[1]])+"\n")
for i in range(total_individuals):
this_gt=sum([g==alleles[0] for g in gts[(i*2):(i*2+2)]])
geno.write(str(this_gt))
geno.write("\n")
[x.close() for x in [hap, don, rec, snp, ind, geno]]
################################################################################
if __name__=="__main__":
options=parse_options()
main(options)