forked from laanwj/decuda
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDisass2.py
executable file
·807 lines (763 loc) · 29.3 KB
/
Disass2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
#!/usr/bin/python
# sm1_1 (G80) disassembler version 0.1 (decuda)
# Wladimir J. van der Laan <laanwj@gmail.com>, 2007
# Old, quirky disassembler
from cStringIO import StringIO
class CubinFile(object):
    """Container for the contents of one parsed cubin file.

    ``kernels`` holds the Kernel objects appended while loading.  The other
    attributes start out as None; presumably they are filled in from
    same-named keys of the cubin file by the loader -- TODO confirm.
    """

    # Class-level defaults, shadowed per instance once a value is assigned.
    kernels = architecture = abiversion = modname = None

    def __init__(self):
        # Every instance gets its own, initially empty, kernel list.
        self.kernels = list()
# sm1_1 atomic operations (indexed by the 4-bit atomic op field of word 1)
atomic_ops = [
    "iadd", "exch", "cas", "fadd",
    "inc", "dec", "imax", "imin",
    "fmax", "fmin", "and", "or",
    "xor", "????", "????", "????",
]

# memory operand sizes (indexed by the 3-bit type field of load/store ops)
msize = [
    ".u8", ".s8",
    ".u16", ".s16",
    ".64", ".128",
    ".u32", ".s32",
]

# logic operations (set); None marks an encoding with no known mnemonic
logic_ops = [
    None,
    ".lt", ".eq", ".le",
    ".gt", ".ne", ".ge",
    None,
]

# op d subop 0 operations; entries 3..10 are holes, entry 11 is "not"
d0_ops = ["and", "or", "xor"] + [None] * 8 + ["not"]

# parameter space
param_space = {
    0x20: "%gridflags",  # lower u16 is gridid
    0x21: "%ntid.x",     # checked
    0x22: "%ntid.y",
    0x23: "%ntid.z",
    0x24: "%nctaid.x",
    0x25: "%nctaid.y",
    0x26: "%ctaid.x",
    0x27: "%ctaid.y",
    0x28: "%ctaid.z",    # extrapolated
    # NOTE(review): same string as 0x25 -- possibly a typo in the original
    # table; confirm against the ptx ISA before changing.
    0x29: "%nctaid.y",   # ptx ISA
    # 0x60 start of "shared memory" as seen from app
    # 0x64 start of actual program parameters
    # What are the rest?
}
def lookup(x, y):
    """
    Look up an entry in an array or hash, and return an
    appropriate response if it runs out of bounds or
    there is a hole.

    x -- a sequence (list/tuple) or a mapping (dict) of mnemonic strings;
         a None entry counts as a hole.
    y -- integer index (for sequences) or key (for mappings).

    Returns x[y] when it exists and is not None, otherwise the
    placeholder string "?<y>?".
    """
    # For sequences a negative index would silently wrap around to the
    # tail of the table (e.g. -1, the "no sub-sub op" marker, would pick
    # the last mnemonic).  That is out of bounds for a lookup table, so
    # report it as such.  Mappings may legitimately use any key.
    if not hasattr(x, "keys") and y < 0:
        return "?%i?" % y
    try:
        entry = x[y]
    except LookupError:
        # IndexError (past the end) or KeyError (missing key)
        entry = None
    if entry is None:
        return "?%i?" % y
    return entry
class Kernel(object):
    """One kernel of a cubin file, together with its disassembler.

    The attributes below start out as None and are assigned from outside
    (the loader sets them with setattr / direct assignment).  ``bincode``
    is the list of integer instruction words that disassemble() walks.
    """
    name = None
    lmem = None # Amount of local mem used
    smem = None # Amount of shared mem used
    reg = None # Number of registers
    bar = None # Number of barriers
    bincode = None

    def print_oper(self, rv, value, type):
        """Write one operand to the stream ``rv``.

        rv    -- file-like object with a write() method
        value -- integer operand value taken from the instruction word(s)
        type  -- operand type code selecting the textual form:
                 "r" register, "pr" register pointer, "pro" offset+register
                 pointer, "pi" immediate pointer, "i" immediate,
                 "y" parameter, "yo" offset+parameter, "x"/"z" constant
                 segments; anything else is printed as a bare hex byte.
        """
        if type == "r":
            rv.write("$r%02x" % (value))
        elif type == "pr":
            rv.write("[$r%02x]" % (value))
        elif type == "pro":
            rv.write("[ofs+$r%02x]" % (value))
        elif type == "pi":
            rv.write("[%08x]" % (value))
        elif type == "i":
            rv.write("0x%08x" % (value))
        elif type == "y": # parameter
            try:
                rv.write(param_space[value])
            except LookupError:
                # Not a known special register: print the raw number
                rv.write("%%%02x" % (value))
        elif type == "yo": # offset+parameter
            rv.write("%%(ofs+%02x)" % (value))
        elif type == "x": # constant (segment 0, used for global data)
            rv.write("$c%02x" % (value))
        elif type == "z": # constant (segment 1, used for segment specific data)
            rv.write("$d%02x" % (value))
        else:
            rv.write("%02x" % (value))

    def print_psize(self, rv, inst, fullinst):
        """Write the operand size/sign suffix of an instruction to ``rv``.

        For a full (two word, non-immediate) instruction the suffix is
        .s/.u followed by 32/16, taken from two bits of inst[1].  Otherwise
        it is .b32 or .b16, taken from one bit of inst[0].
        """
        if fullinst:
            rv.write(".")
            if (inst[1] & 0x08000000):
                rv.write("s")
            else:
                rv.write("u")
            if (inst[1] & 0x04000000):
                rv.write("32")
            else:
                rv.write("16")
        else:
            rv.write(".")
            if (inst[0] & 0x00008000):
                rv.write("b32")
            else:
                rv.write("b16")

    def print_psize2(self, rv, inst, fullinst):
        """Write .s32 or .u32 to ``rv`` depending on bit 0x00008000.

        The bit is read from inst[1] for a full instruction and from
        inst[0] otherwise.
        """
        # xx fullinst and print_psize2
        if fullinst:
            x = inst[1] & 0x00008000
        else:
            x = inst[0] & 0x00008000
        rv.write(".")
        if x:
            rv.write("s32")
        else:
            rv.write("u32")

    def disassemble(self):
        """Disassemble the cubin instructions in this kernel.

        Walks self.bincode one instruction at a time (one or two words,
        depending on bit 0 of the first word) and returns the listing as a
        single string, one line per instruction.  A missing bincode (None)
        yields an empty listing; a trailing two-word instruction whose
        second word is missing is reported as "?truncated?" instead of
        raising IndexError.
        """
        code = self.bincode or []
        rv = StringIO()
        ptr = 0
        while ptr < len(code):
            base = ptr*4
            inst = [code[ptr]]
            ptr += 1
            if inst[0] & 1:
                # Bit 0 set: 64 bit instruction, second word follows
                if ptr >= len(code):
                    # Input ends in the middle of an instruction
                    rv.write("%04x: %-17s ?truncated?\n" % (base, "%08x" % inst[0]))
                    break
                inst.append(code[ptr])
                ptr += 1
            rv.write("%04x: %-17s " % (base, " ".join(["%08x" % x for x in inst])))
            op = (inst[0]&0xF0000000)>>28
            # Misc flags
            flags = (inst[0]&0x0FC00000)>>22
            twoword = (len(inst) == 2)
            if twoword:
                subop = (inst[1]&0xE0000000)>>29
            else:
                subop = 0
            # Marker value 3 in the low bits of word 1 means "immediate"
            immediate = twoword and (inst[1] & 3) == 3
            system = bool(inst[0] & 2) # flow control
            # Is the second instruction present?
            fullinst = twoword and not immediate
            if fullinst: # subsubop
                lop = (inst[1]&0x001fc000)>>14
            else:
                lop = -1
            # Changes some instructions
            # Seems to choose an alternative instruction set, for example,
            # add becomes sub. It is also used on mul24.lo.s32 sometimes
            # but I have no idea what its effect is there.
            alt = flags&1
            print_msize = False
            print_psize = False
            print_psize2 = False
            swap_oper = False
            type_float = False
            is_bra = False
            oper1_t = "r" # Operand type u(unknown), r(reg), i(imm), p(pointer)
            oper2_t = "r"
            oper3_t = "r"
            oper4_t = None
            # Predication
            if fullinst:
                onpred = (inst[1]&0x00000780)>>7
                pred = (inst[1]&0x00003000)>>12
                if onpred == 15 or onpred == 0:
                    pass # No pred
                elif onpred == 2:
                    # Execute on false
                    rv.write("@!p%i " % pred)
                elif onpred == 5:
                    # Execute on true
                    rv.write("@p%i " % pred)
                else:
                    # ??
                    rv.write("@?%i?p%i " % (onpred,pred))
            # Print the main instruction
            if system:
                oper1_t = None
                oper2_t = None
                oper3_t = None
                oper4_t = None
                flags = 0
                alt = 0
                # Special instructions
                if op == 0x1:
                    rv.write("bra")
                    is_bra = True
                elif op == 0x2:
                    rv.write("call")
                    is_bra = True
                elif op == 0x3:
                    rv.write("return")
                elif op == 0x8:
                    rv.write("bar.sync")
                    # 0x0003xfff where x is the barrier id, 0-15
                    # The other bits are always like this...
                elif op == 0x9:
                    rv.write("trap")
                elif op == 0xA:
                    rv.write("join") # join point for divergent threads
                    is_bra = True
                else:
                    rv.write("?%i?" % op)
            elif op == 0x0:
                if subop == 0x3: # Load internal value
                    rv.write("ldgpu")
                    if lop == 0:
                        rv.write(".physid")
                    elif lop == 1:
                        rv.write(".clock")
                    elif lop >= 4 and lop < 8:
                        rv.write(".pm%i" % (lop-4))
                    else:
                        rv.write(".??")
                elif subop == 0x6:
                    rv.write("ld.offset1.shl")
                    oper3_t = "i"
                elif subop == 0x7:
                    rv.write("st.shared")
                    # TODO: oper1 extends into oper3 for large values
                    oper4_t = "r"
                    oper3_t = None
                    # long offset in word 1
                    # 8 bit reads offset is *4
                    # 16 bit read offset is *2
                    # 32 bit reads offset is *1
                    # always loading at offset 60
                    stype = (inst[1]&0x07E00000) >> 21
                    if stype == 0:
                        rv.write(".b16")
                    elif stype == 33:
                        rv.write(".b32")
                    elif stype == 2:
                        rv.write(".b8")
                    else:
                        rv.write(".?%i?" % stype)
                    if lop == 0:
                        # Absolute offset
                        rv.write(".abs")
                    elif lop == 1:
                        # Register offset
                        # The name `offsetr` was never defined, so this
                        # branch used to raise NameError.  It is taken here
                        # to be flag bit 0x10, which the rest of this method
                        # treats as "use offset register".
                        # NOTE(review): inferred, confirm against hardware.
                        offsetr = flags & 0x10
                        if offsetr: # Use offset register
                            oper2_t = "pro"
                        else:
                            oper2_t = "pr"
                        oper4_t = None
                    #flags = 0 # flags contain something else?
                else:
                    rv.write("????")
            elif op == 0x1:
                print_psize = True
                if subop == 0x0:
                    if lop == 0xF or lop == -1:
                        rv.write("ld")
                        if immediate:
                            oper1_t = None
                        else:
                            oper2_t = "r"
                            oper3_t = None
                    else:
                        rv.write("?%i?" % lop)
                elif subop == 0x1:
                    rv.write("ld.const") # Load const from offset into register
                    # TODO: oper1 extends into oper3 for large values
                    # Bit inst[1]&0x00400000 selects the segment to load from
                    if (inst[1]&0x00400000):
                        rv.write("1")
                    else:
                        rv.write("0")
                    oper4_t = None
                    oper3_t = None
                    oper1_t = "pi"
                else:
                    rv.write("????")
            elif op == 0x2:
                print_psize = True
                if subop == 0x0:
                    if alt:
                        alt = False # reset alt flag, as we know the effect in this case
                        rv.write("sub")
                    else:
                        rv.write("add")
                    if fullinst and (inst[1]&0x08000000):
                        # .s32 also signifies saturation
                        rv.write(".sat")
                    if fullinst:
                        oper4_t = "r"
                        oper3_t = None
                else:
                    rv.write("????")
            elif op == 0x3:
                print_psize = True
                if subop == 0: # seems to be a sub with arguments swapped
                    rv.write("neg")
                    if fullinst: # for some reason, the arguments for this instruction are oper1,oper2,oper4
                        oper3_t = None
                        oper4_t = "r"
                elif subop == 3:
                    rv.write("set")
                    rv.write(lookup(logic_ops, lop))
                elif subop == 4:
                    rv.write("max")
                elif subop == 5:
                    rv.write("min")
                elif subop == 6:
                    rv.write("shl")
                elif subop == 7:
                    rv.write("shr")
                else:
                    rv.write("????")
            elif op == 0x4:
                #print_psize = True
                if subop == 0:
                    rv.write("mul24") # XXX hi/lo word
                    print_psize2 = True # based on bit 1 of lop
                    if lop == 0:
                        print_psize2 = False
                        rv.write(".u32.u16")
                    elif lop == 4 or lop == 6 or lop == -1: # no lop specified
                        rv.write(".lo")
                    elif lop == 5 or lop == 7:
                        rv.write(".hi")
                    else:
                        rv.write(".?%i?" % lop)
                else:
                    rv.write("????")
            elif op == 0x5:
                if subop == 0:
                    rv.write("sad") # Sum of Absolute Differences
                    oper4_t = "r"
                    print_psize = True
                else:
                    rv.write("????")
            elif op == 0x6:
                if subop == 0:
                    rv.write("mad24.u32.u16")
                    oper4_t = "r"
                elif subop == 3:
                    rv.write("mad24.lo.u32")
                    oper4_t = "r"
                elif subop == 4:
                    rv.write("mad24.lo.s32")
                    oper4_t = "r"
                elif subop == 5:
                    rv.write("mad24.lo.sat.s32")
                    oper4_t = "r"
                elif subop == 6:
                    rv.write("mad24.hi.u32")
                    oper4_t = "r"
                elif subop == 7:
                    rv.write("mad24.hi.s32")
                    oper4_t = "r"
                else:
                    rv.write("????")
                # mad24.hi.sat.s32 runs over to op==0x7 subop==0x0, but I assume
                # this is a ptxas bug, as this instruction makes no sense.
            elif op == 0x9:
                type_float = True
                if subop == 0:
                    rv.write("rcp")
                elif subop == 2:
                    rv.write("rsqrt")
                elif subop == 3:
                    rv.write("lg2")
                elif subop == 4:
                    rv.write("sin")
                elif subop == 5:
                    rv.write("cos")
                elif subop == 6:
                    rv.write("ex2")
                else:
                    rv.write("????")
            elif op == 0xa: # Conversion ops
                oper3_t = None
                if subop == 0:
                    rv.write("cvt")
                    types = [".u16",".u32",".u8","??",".s16",".s32",".s8","??"]
                    self.print_psize(rv, inst, fullinst)
                    rv.write(types[lop&7])
                elif subop == 1: # Used in div implementation
                    rv.write("????")
                elif subop == 2:
                    rv.write("cvt")
                    rops = [".rn",".rm",".rp",".rz"]
                    types = [".u16",".u32",".u8","??",".s16",".s32",".s8","??"]
                    # lop>>3 can exceed the 4-entry table; lookup() prints
                    # a "?n?" placeholder instead of raising IndexError
                    rv.write(lookup(rops, lop>>3))
                    rv.write(".f32")
                    rv.write(types[lop&7])
                elif subop == 3: # saturate or scale?
                    rv.write("sat.f32.u32") # conversion; int 0xFFFFFFFF to float 1.0f
                elif subop == 4:
                    rv.write("cvt")
                    rops = [".rni",".rmi",".rpi",".rzi"]
                    if (lop&7)!=1:
                        rv.write("?") # Unknown conversion
                    # see subop 2: guard against an out of range rounding mode
                    rv.write(lookup(rops, lop>>3))
                    self.print_psize(rv, inst, fullinst)
                    rv.write(".f32")
                elif subop == 6:
                    rops = [".rn",".rm",".rp",".rz"]
                    if (lop&7)!=1:
                        rv.write("?") # Unknown conversion
                    rv.write("cvt")
                    rv.write(lookup(rops, (lop>>3)&3))
                    if (inst[1]&0x08000000):
                        rv.write("i") # integer rounding mode
                    if (lop&32):
                        rv.write(".sat")
                    if (lop&64):
                        rv.write(".abs")
                    rv.write(".f32.f32")
                elif subop == 7:
                    if lop == 1:
                        rv.write("neg")
                        type_float = True
                    else:
                        rv.write("????")
                else:
                    rv.write("????")
            elif op == 0xb:
                type_float = True
                if subop == 0:
                    # This determines the size of the operand (16 or 32 bit) for
                    # integer instructions. For floating point instructions it
                    # has a different meaning.
                    if fullinst:
                        sizebit = (inst[1]&0x08000000)
                    else:
                        sizebit = (inst[0]&0x00008000)
                    if sizebit and alt:
                        rv.write("????") # an add too?
                    elif alt:
                        rv.write("sub") # d,a,b -> d = a-b
                    elif sizebit:
                        rv.write("sub2") # arguments reversed d,a,b -> d = b-a
                    else:
                        rv.write("add")
                    if fullinst: # for some reason, the arguments for this instruction are oper1,oper2,oper4
                        oper3_t = None
                        oper4_t = "r"
                    #oper3_t = "i"
                    alt = False
                elif subop == 3:
                    rv.write("set")
                    rv.write(lookup(logic_ops, lop))
                elif subop == 4:
                    rv.write("max")
                elif subop == 5:
                    rv.write("min")
                elif subop == 6: # I don't know what this operation does, but is executed before sin, cos, and lg2
                    # Denormalize maybe?
                    rv.write("presin")
                    oper3_t = None
                else:
                    rv.write("????")
            elif op == 0xc:
                if subop == 0:
                    type_float = True
                    rv.write("mul")
                    if lop == 0 or lop == -1:
                        rv.write(".rn")
                    elif lop == 3:
                        rv.write(".rz")
                    else:
                        rv.write(".??")
                elif subop == 2:
                    oper4_t = "r"
                    rv.write("slct") # Select one of both arguments based on value of oper4
                else:
                    rv.write("????")
            elif op == 0xd:
                if subop == 0x0:
                    # (An unused read of inst[1] was removed here: subop is
                    # also 0 for single word instructions, which have no
                    # inst[1], so it raised IndexError.)
                    print_psize = True
                    rv.write(lookup(d0_ops,lop))
                elif subop == 0x1:
                    rv.write("ld.offset0")
                    # xx inst[0] contains the offset in the area oper1 and oper3 generally are
                    # oper5?
                elif subop == 0x2:
                    print_msize = True
                    oper1_t = "pr"
                    oper2_t = "r"
                    oper3_t = None
                    rv.write("ld.local")
                elif subop == 0x3:
                    print_msize = True
                    swap_oper = True
                    oper1_t = "pr"
                    oper2_t = "r"
                    oper3_t = None
                    rv.write("st.local")
                elif subop == 0x4:
                    print_msize = True
                    oper1_t = "pr"
                    oper2_t = "r"
                    oper3_t = None
                    rv.write("ld.global")
                elif subop == 0x5:
                    print_msize = True
                    oper1_t = "pr"
                    oper2_t = "r"
                    oper3_t = None
                    swap_oper = True # swap src and dest
                    rv.write("st.global")
                elif subop == 0x6: # atomic, ignore output
                    rv.write("atom.global.")
                    rv.write(lookup(atomic_ops,(inst[1]&0x0000003C)>>2))
                    print_msize = True
                elif subop == 0x7: # atomic, provide out reg
                    rv.write("atom.global.out.")
                    rv.write(lookup(atomic_ops,(inst[1]&0x0000003C)>>2))
                    print_msize = True
                else:
                    rv.write("????")
            elif op == 0xe:
                if subop == 0x0:
                    rv.write("mad")
                    type_float = True
                    oper4_t = "r"
                else:
                    rv.write("????")
            elif op == 0xf:
                if subop == 0x0:
                    rv.write("tex")
                    tex_type = (inst[0]&0x03C00000)>>22
                    tex_bit = (inst[0]&0x04000000)
                    # 0 is also 1d, 1 is also 2d ...
                    if tex_type == 0x8: # tex type
                        rv.write(".1d.f32.f32")
                    elif tex_type == 0x9:
                        rv.write(".2d.f32.f32")
                    elif tex_type == 0xa:
                        rv.write(".3d.f32.f32")
                    elif tex_type == 0xc:
                        rv.write(".1d.f32.s32")
                    elif tex_type == 0xe:
                        rv.write(".2d.f32.s32")
                    elif tex_type == 0xf:
                        rv.write(".3d.f32.s32")
                    else:
                        rv.write("?%i?" % tex_type)
                    if tex_bit: # Don't know what this bit does
                        rv.write(".?")
                    flags = 0 # flags are useless here as the bits contain something else
                    alt = 0
                elif subop == 0x7:
                    rv.write("nop")
                else:
                    rv.write("????")
            else:
                # unknown ops: 0x7, 0x8
                rv.write("[op %01x subop %01x]" % (op,subop))
            if fullinst and print_msize: # has second word, not immediate
                opsize = (inst[1] & 0x00E00000)>>21
                rv.write(msize[opsize])
            # size and sign of data, if not immediate
            if print_psize:
                self.print_psize(rv, inst, fullinst)
            if print_psize2:
                self.print_psize2(rv, inst, fullinst)
            if type_float:
                rv.write(".f32")
            rv.write(" ")
            if alt:
                rv.write("[alt] ")
            # Extract and print operands
            if oper3_t is not None and (flags&2):
                # operand 3 comes from constant (in segment 0)
                oper3_t = "z"
            if fullinst and (inst[1]&0x00400000):
                #rv.write("[co3] ")
                # operand 4 comes from constant in segment 1
                if oper4_t is not None:
                    oper4_t = "x"
                elif oper3_t is not None:
                    oper3_t = "x"
            #if flags&4:
            #    # operand 1 comes from param
            #    #rv.write("[src from param?] ")
            #    oper1_t = "y" # from param
            #    oper1 += 0x30
            # ld.offset flags 10 means:
            #   multiply src with 4
            #   offset from
            if (flags&(~(4|2|1))):
                rv.write("[flags 0x%02x] " % flags)
            if fullinst: # Depending on this, source is 7 or 6 bits
                oper1 = (inst[0]&0x0000FE00)>>9
            else:
                oper1 = (inst[0]&0x00007E00)>>9
            oper2 = (inst[0]&0x000001FC)>>2
            if fullinst:
                oper4 = (inst[1]&0x001fc000)>>14
            else:
                oper4 = None
            # seems that bit 6 of src has another meaning when this is a immediate
            # instruction (namely "32 bit")
            if oper3_t is not None and fullinst and (inst[1]&0x00100000):
                # Operand 3 is immediate
                oper3_t = "i"
            #else:
            #    oper3_t = "r"
            # Process parameter numbers in shared memory
            if fullinst and (inst[1]&0x00200000) and not print_msize:
                if (flags&0x10):
                    oper1_t = "yo" # from offset reg
                else:
                    oper1_t = "y"
            elif flags&4:
                oper1_t = "yw"
                #if oper1 >= 0x30: # Weird mapping for 32 bit
                #    oper1 += 0x30
                #else:
                #    oper1 += 0x10
            if immediate:
                # Immediate data
                oper3 = ((inst[1]&0x0FFFFFFC)<<4) | ((inst[0]&0x003F0000)>>16)
                oper3_t = "i"
                #rv.write("[imm %08x]" % imm)
            else:
                oper3 = ((inst[0]&0x003F0000)>>16)
                #rv.write("[imm %02x]" % imm)
            if swap_oper: # to, from
                if oper1_t is not None:
                    self.print_oper(rv, oper1, oper1_t)
                if oper2_t is not None:
                    rv.write(", ")
                    self.print_oper(rv, oper2, oper2_t)
            else:
                if oper2_t is not None:
                    self.print_oper(rv, oper2, oper2_t)
                if oper1_t is not None:
                    rv.write(", ")
                    self.print_oper(rv, oper1, oper1_t)
            if oper3_t is not None:
                rv.write(", ")
                #if flipsrc2:
                #    rv.write("-")
                self.print_oper(rv, oper3, oper3_t)
            if oper4 is not None and oper4_t is not None:
                rv.write(", ")
                self.print_oper(rv, oper4, oper4_t)
            if is_bra:
                # Branch instruction, divide address for convenience
                # I know the address is longer, as CUDA can address up to
                # 2Mb of kernel instructions, but I have never been
                # able to generate a kernel this big without crashing the
                # ptxas. Probably, the higher part is in inst[1].
                addr = (inst[0]&0x0FFFFE00)>>9
                if addr&3:
                    rv.write("!nonaligned!")
                rv.write("0x%08x" % (addr))
            elif system:
                addr = (inst[0]&0x0FFFFE00)>>9
                rv.write("0x%08x " % (addr))
            # Offset (integrate this into operand)
            if flags&0x10:
                offr = (flags&0x08)>>3
                rv.write(" +ofs%i" % (offr))
            # Predication (set)
            if fullinst and (inst[1]&0x0000040):
                pred = (inst[1]&0x00000030)>>4
                rv.write(", p%i" % pred)
            # Bits still unknown
            # inst[0]:
            #   0xF0000000 op
            #   0x0FC00000 flags
            #     0x04000000 use offset register
            #     0x02000000 \- offset register 0/1
            #     0x01000000 oper3 from constant
            #     0x00800000 oper2 from parameter (offset 0x30)
            #     0x00400000 alternative instruction, ie add->sub
            #   0x003F0000 oper3 (full instruction)
            #   0x001F0000 oper3 (short instruction or immediate)
            #   0x0000FE00 oper2
            #   0x000001FC oper1
            #   0x00000002 0=normal, 1=system (flow control)
            #   0x00000001 0=32bit, 1=64 bit
            # inst[1]:
            #   0xE0000000 subop
            #   0x08000000 signed/unsigned
            #   0x04000000 32/16
            #   0x01000000 oper3 is immediate
            #   0x00E00000 type, on load instructions
            #   0x00400000 oper3/4 from constant in segment 1
            #   0x00200000 oper2 from parameter
            #   0x001FC000 oper4 or sub-sub op
            #   0x00003000 predicate to act on
            #   0x00000400 ?? (usually set, unless predicated)
            #   0x00000200 execute on pred
            #   0x00000100 execute on !pred
            #   0x00000080 ?? (usually set, unless predicated)
            #   0x00000040 set predicate
            #   0x00000030 predicate to set
            #   0x0000003C atomic op
            #   0x00000003 marker (0=normal,1=end,2???,3=immediate)
            # to find: predication
            rv.write("\n")
        return rv.getvalue()
class Dummy:
    """Sink for cubin environments whose contents are not needed.

    It offers the same extend() call a list does, but throws the data away.
    """

    def extend(self, x):
        # Deliberately discard whatever is handed in.
        return None
def load(name):
    """Load a cubin binary assembly file.

    name -- path of the text file to read.

    The file is parsed line by line:
      * ``cmd { value }`` on one line sets attribute ``cmd`` on the
        current environment;
      * ``cmd {`` opens a nested environment ("code" creates a Kernel,
        "bincode" a list of instruction words, "consts"/"mem"/"sampler"/
        "const" are ignored);
      * ``}`` closes the current environment;
      * ``key = value`` sets an attribute on the current environment;
      * anything else is taken as whitespace separated integer words and
        appended to the current environment.

    Returns the populated CubinFile.
    Raises ValueError for an unknown environment name or a word that is
    not an integer literal.
    """
    ex = CubinFile()
    inside = [ex]  # stack of open environments, innermost last
    # `with` guarantees the file is closed, also when parsing fails
    with open(name, "r") as f:
        for raw in f:
            # Strip only the line terminator.  Blindly dropping the last
            # character would eat real data on a final line that has no
            # trailing newline.
            line = raw.rstrip("\r\n")
            if line.strip() == "":
                # Empty line
                continue
            closebrace = line.rfind("}")
            openbrace = line.find("{")
            equalpos = line.find("=")
            if openbrace != -1:
                cmd = line[0:openbrace].strip()
                if closebrace != -1:
                    # Opened and closed on the same line: plain attribute
                    value = line[openbrace+1:closebrace]
                    setattr(inside[-1], cmd, value)
                elif cmd == "code":
                    kernel = Kernel()
                    inside[-1].kernels.append(kernel)
                    inside.append(kernel)
                elif cmd == "bincode":
                    inst = []
                    inside[-1].bincode = inst
                    inside.append(inst)
                elif cmd in ("consts", "mem", "sampler", "const"):
                    # Ignore
                    inside.append(Dummy())
                else:
                    raise ValueError("Invalid environment %s" % cmd)
            elif closebrace != -1:
                inside.pop()
            elif equalpos != -1:
                valname = line[0:equalpos].strip()
                valvalue = line[equalpos+1:].strip()
                setattr(inside[-1], valname, valvalue)
            else:
                # Bincode?
                # split() without an argument tolerates tabs and repeated
                # spaces, which split(" ") turned into empty tokens.
                inside[-1].extend([int(x, 0) for x in line.split()])
    return ex
if __name__ == "__main__":
    # Command line entry point: disassemble the first kernel of a cubin file.
    import sys
    if len(sys.argv) < 2:
        # Missing argument used to surface as a bare IndexError
        sys.stderr.write("usage: %s <cubin file>\n" % sys.argv[0])
        sys.exit(2)
    cu = load(sys.argv[1])
    if not cu.kernels:
        sys.stderr.write("%s: no kernels found\n" % sys.argv[1])
        sys.exit(1)
    # Same output as the former `print` statement (text plus a newline),
    # but valid syntax on both Python 2 and Python 3.
    sys.stdout.write(cu.kernels[0].disassemble() + "\n")